Aim: To test the association between age-related macular degeneration (AMD) with epigenetic age.

Pre-processing script adapted from Chaini Konwar.


library(GEOquery)
library(RCurl)
library(GEOmetadb)
library(dendextend)
library(ArrayExpress)
library(methylumi)
library(lumi)
library(lattice)
library(gplots)
library(RColorBrewer)
library(limma)
library(ROC)
library(matrixStats)
library(reshape)
library(sva)
library(grid)
library(gridExtra)
library(ape)
library(Hmisc)
library(RCurl)
library(wateRmelon)
library(minfiData)
library(minfi)
library(robustHD)
library(ewastools)
library(omicsPrint)
library(doParallel)
library(jcolors)
library(plyr)
library(tidyverse)
library(ggrepel)
library(ggpubr)
library(IlluminaHumanMethylationEPICanno.ilm10b2.hg19)
library(IlluminaHumanMethylationEPICmanifest)
library(IlluminaHumanMethylation450kanno.ilmn12.hg19)
library(IlluminaHumanMethylation450kmanifest)
library(FlowSorted.Blood.EPIC)
library(FlowSorted.Blood.450k)
library(FlowSorted.CordBloodCombined.450k)
library(Biobase)
library(data.table)
library(factoextra)
library(Metrics)
library(quantro)
library(impute)

setwd("~/KoborLab/kobor_space/kendrix/macular_degeneration/")

Data exploration and object creation

Step 1: Explore data downloaded from ArrayExpress.

The data used here comes from Whole-genome methylation profiling of the retinal pigment epithelium of individuals with age-related macular degeneration reveals differential methylation of the SKI, GTF2H4, and TNXB genes by Porter et al..

The authors made the dataset publicly available in ArrayExpress with the Accession number: E-MTAB-7183.

The dataset consists of ocular tissue (pigmented layer of retina) from 25 AMD samples (21 level 2 AMD and 4 level 3 AMD) and 19 controls.

The age ranges from 50 years old to 89 years old, and the sex distribution is as follows: Male (27), Female (17).

#Explore metadata.
AMD_meta <- read.table("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/E-MTAB-7183_meta.txt", sep = "\t", header = TRUE)
colnames(AMD_meta)[1] <- "Sample_Name"
colnames(AMD_meta)[3] <- "Sex"
colnames(AMD_meta)[4] <- "Age"
colnames(AMD_meta)[6] <- "Tissue"
colnames(AMD_meta)[7] <- "Disease_state"
colnames(AMD_meta)[17] <- "Assay_name"

AMD_meta$Sex <- gsub("male", "M", gsub("female", "F", AMD_meta$Sex)) #Changed for easier comparison with minfi sex prediction. 

#Create samplesheet.
AMD_samplesheet <- AMD_meta[, c("Sample_Name", "Disease_state", "Assay_name", "Sex", "Age", "Tissue")]
colnames(AMD_samplesheet)[2] <- "Sample_Group"
AMD_samplesheet$Assay_name <- gsub("_Grn", "", gsub("_Red", "", AMD_samplesheet$Assay_name))
AMD_samplesheet <- separate(AMD_samplesheet, col = "Assay_name", sep = "_", into = c("Sentrix_ID", "Sentrix_Position"))

duplicated(AMD_samplesheet) #The samplesheet contains duplicated samples. 
AMD_samplesheet <- AMD_samplesheet[duplicated(AMD_samplesheet),] #Remove duplicated samples. 
write.csv(AMD_samplesheet, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/AMD_samplesheet.csv", row.names = FALSE)

Step 2: Create ExtendedRGSet object.

path = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project"
targets <- read.metharray.sheet(path) #Tells R to look for sample sheet within the folder. 
baseDir <- system.file(path, package = "minfiData") 
baseDir #baseDir determines the array chip position and ID.
sub(baseDir, "", targets$Basename) #The class of RGSet is an RGChannelSet object. This is the initial object of a minfi analysis that contains the raw intensities in the green and red channels. Note that this object contains the intensities of the internal control probes as well.

#Create Extended RGSet object.
AMD_ExtendedRGSet <- read.metharray.exp(targets = targets, extended = TRUE, verbose = TRUE)
AMD_ExtendedRGSet
sampleNames(AMD_ExtendedRGSet) == paste0(AMD_samplesheet$Sentrix_ID, "_", AMD_samplesheet$Sentrix_Position) #Check order of sampleNames in RGSet and samplesheet to change sampleNames in RGSet. Proceed ONLY if all is TRUE. 
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
sampleNames(AMD_ExtendedRGSet) <- AMD_samplesheet$Sample_Name #Change sampleNames of AMD_ExtendedRGSet.
identical(sampleNames(AMD_ExtendedRGSet), rownames(pData(AMD_ExtendedRGSet))) #Confirm sample orders in pData.
## [1] TRUE
identical(sampleNames(AMD_ExtendedRGSet), colnames(getBeta(AMD_ExtendedRGSet))) #Confirm sample orders in beta matrix. 
## [1] TRUE
save(AMD_ExtendedRGSet, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_ExtendedRGSet.RData")

Step 3: Create ewastools object.

#Create ewastools sampleInfo sheet.
AMD_sampleInfo <- AMD_meta[,c("Sample_Name", "Disease_state", "Assay_name", "Sex", "Age", "Tissue")]
AMD_sampleInfo$Assay_name <- gsub("_Red", "", gsub("_Grn", "", AMD_sampleInfo$Assay_name))
AMD_sampleInfo <- unique(AMD_sampleInfo) #Remove duplicates. 
path <- "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/"
AMD_sampleInfo$Assay_name <- paste0(path, AMD_sampleInfo$Assay_name)
meth <- read_idats(AMD_sampleInfo$Assay_name, quiet = TRUE)
## [1] 622399
rm(AMD_meta) #Remove original meta file. 
save(AMD_sampleInfo, meth, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_ewastools_object.RData")

Quality Control - Sample Filtering

Step 1: Load objects.

load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_ExtendedRGSet.RData")
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_ewastools_object.RData")

AMD_samplesheet <- read.csv("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/AMD_samplesheet.csv", header = TRUE)

Step 2: Detection p-value.

It is worth checking the quality of the samples to see if they are true signals and are not conflated with background noise. One way to do so is by determining the detection p-values of the samples to parse out true methylation signals. Detection p-values by definition are measures that differentiate sample signal from background noise (which is estimated using the negative probes of the array). By default, the threshold for significant detection p-value is set at 0.01. Samples that are above the p-value threshold are considered statistically poor and should be removed (i.e. samples with high detection p-value should be discarded because they were detected with low signal-to-noise ratio of fluorescence intensities).

The minfi package provides a function, detectionP() to determine detection p-values for each methylation region across samples using the negative control probes in the array that are designed to NOT target the human genome. The probe sequences are propriety and are suggested to feature very low intensities (Heiss & Just (2019)).

NOTE: No samples above detection p-value of 0.01. All samples passed.

detp_minfi <- minfi::detectionP(AMD_ExtendedRGSet) #Detection p-value distinguishes signal from background noise with a single cut-off.
head(detp_minfi)[,1:5]
##                Sample 1    Sample 10    Sample 11    Sample 12    Sample 13
## cg00050873 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## cg00212031 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## cg00213748 6.491336e-46 7.633429e-54 3.984433e-32 4.143616e-56 1.782518e-25
## cg00214611 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## cg00455876 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## cg01707559 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
#Examine mean detection p-values across all samples to identify any failed samples.
plot(colMeans(detp_minfi), ylim = c(0.00002, 0.05), xaxt = 'n', ann = FALSE, pch = 20, col = "black", cex = 1) + 
  mtext(side = 1, line = 0.5, "Samples", font = 1, cex = 1) + 
  mtext(side = 2, line = 2, "Mean detection p-values", font = 1, cex = 1) + 
  abline(h = 0.01, col = "red") + 
  text(colMeans(detp_minfi), labels = AMD_ExtendedRGSet$Sample_Name, cex = 0.5, font = 2, pos = 2)

## integer(0)

Heiss and Just recommends using the non-specific fluorescence intensities to estimate the background signal, which is a more accurate and stringent method without needing to set an extreme detection p-value cut-off. They imply that this method not only “protect against false-positive findings” but “against false-negative findings as well”. They show that their method calls almost all Y-chromosome probes among males, but classifies most Y-chromosome probes in females as undetected using the 0.01 cut-off, which is the intended outcome as females possess XX chromosomes and should not have any signal detected in Y-chromosomes. They also show similar results detected using the negative control probes, but with a more extreme cut-off and more samples filtered (> 1e-40).

NOTE: No samples above detection p-value of 0.01. All samples passed.

detp_ewastools <- ewastools::detectionP(meth) #Detection p-value distinguishes signal from background noise with a single cut-off.
detp <- as.data.frame(detp_ewastools$detP)
head(detp)[,1:5]
##             V1            V2            V3            V4           V5
## 1 5.361168e-88  5.204288e-92 7.375519e-131 4.212671e-123 1.016425e-37
## 2 0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00 0.000000e+00
## 3 7.482801e-20  7.280083e-19  6.448259e-27  5.011001e-19 1.696454e-08
## 4 6.928902e-36  1.822559e-27  1.463483e-40  2.730796e-38 1.150689e-15
## 5 0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00 0.000000e+00
## 6 0.000000e+00 4.857922e-250  0.000000e+00  0.000000e+00 0.000000e+00
colnames(detp) <- colnames(getBeta(AMD_ExtendedRGSet))
rownames(detp) <- meth$manifest$probe_id

#Examine mean detection p-values across all samples to identify any failed samples.
plot(colMeans(detp, na.rm = TRUE), ylim = c(0.00002, 0.05), xaxt = 'n', ann = FALSE, pch = 20, col = "black", cex = 1) + 
  mtext(side = 1, line = 0.5, "Samples", font = 1, cex = 1) + 
  mtext(side = 2, line = 2, "Mean detection p-values", font = 1, cex = 1) + 
  abline(h = 0.01, col = "red") + 
  text(colMeans(detp, na.rm = TRUE), labels = colnames(detp), cex = 0.5, font = 2, pos = 2)

## integer(0)

Step 3: Beadcount.

For each probe sequence in the 450K array, a median of 14 beads is randomly distributed on the array. Each of these beads contains hundreds of thousands of oligonucleotides. This provides a unique set of internal technical replication on each array. This step is done to remove probes that are not represented by a minimum of 3 beads on the array, which is important to correct for positional effects (i.e. the effects where the same sample in different physical positions on the array could be measured as different methylation levels).

Probes that are not represented by a minimum of 3 beads on the array are designated as NA with the beadcount() function.

NOTE: All samples passed.

#Calculate the number of samples with bead count <3 for each probe in a matrix of bead count values.
bead <- beadcount(AMD_ExtendedRGSet)
colnames(bead) <- gsub('X', '', colnames(bead))

AMD_samplesheet <- AMD_samplesheet %>% mutate(Beadcount = colSums(is.na(bead)))

AMD_samplesheet %>% 
  mutate(Sample_Name = factor(as.character(Sample_Name), levels = Sample_Name)) %>%
  ggplot(aes(x = Sample_Name, y = Beadcount)) +
  geom_point(alpha = 0.7, color = 'black') + 
  geom_hline(yintercept = 0.01*nrow(bead), linetype = 'dashed', color = 'green') +
  geom_text(aes(x = 0, y = 0.01*nrow(bead)), 
            label = '1%', vjust = -0.5, hjust = -0.5, color = 'green')+
  scale_y_continuous(limits = c(0, 12500), breaks = seq(0, 12500, 2500)) +
  labs(x = 'Samples', y = '', title = '# Samples with probes with bead count < 3') +
  theme_classic() +
  theme(axis.text.x = element_blank()) 

save(detp_minfi, bead, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_detp_beadcount_QC.RData")

Step 4: Log median intensity of methylated and unmethylated channels.

minfi package provides a simple quality control plot that uses the log median intensity in both the methylated (M) and unmethylated (U) channels. When plotting these two medians against each other, it has been observed that good samples cluster together with higher median intensities, while failed samples tend to separate and have lower median intensities.

NOTE: All samples passed.

AMD_MSet <- preprocessRaw(AMD_ExtendedRGSet) #Get MSet object from RGSet.
head(getMeth(AMD_MSet)[,1:3])
##            Sample 1 Sample 10 Sample 11
## cg00050873    13702     13026     14553
## cg00212031      459       753       391
## cg00213748     1442      1763      1505
## cg00214611      224       434       119
## cg00455876     3911      4834      5463
## cg01707559      795       802       759
head(getUnmeth(AMD_MSet)[,1:3])
##            Sample 1 Sample 10 Sample 11
## cg00050873     1821      2141      2001
## cg00212031     5111      5494      5486
## cg00213748      257       153       311
## cg00214611     5876      6057      5677
## cg00455876     1178      1320      1431
## cg01707559     7699      8679      8710
AMD_QC <- getQC(AMD_MSet)
head(AMD_QC)
## DataFrame with 6 rows and 2 columns
##                       mMed             uMed
##                  <numeric>        <numeric>
## Sample 1  10.6375305515253 11.3415188232452
## Sample 10 11.0188956211216 11.4361909954814
## Sample 11 11.1478408936723 11.5401280385821
## Sample 12 10.8811139606751 11.4782640315817
## Sample 13 10.8049376720519 11.2801907922169
## Sample 14 10.9708249013626 11.3426302986784
plotQC(AMD_QC)

Step 5: Control metrics.

Quality control metrics are examined to determine the success of the bisulphite conversion and subsequent array hybridisation. This check uses the Illumina’s 636 control probes to assess technical parameters including array staining, extension, hybridization, target removal, specificity, and bisulfite conversion.

NOTE: All samples passed control metrics.

ctrls <- control_metrics(meth)
# A logical vector of passed/failed is returned by sample_failure() which compares all 17 metrics against the thresholds recommended by Illumina.

AMD_sampleInfo$failed <- as.data.frame(sample_failure(ctrls))
table(AMD_sampleInfo$failed) #If AMD_sampleInfo$failed == FALSE, all samples PASS.
## 
## FALSE 
##    44
failed_control_metrics <- AMD_sampleInfo[AMD_sampleInfo$failed == TRUE, "Sample_Name"] #Check the sample names for the ones that fail control metrics.
failed_control_metrics #No failed samples. 
## factor(0)
## 44 Levels: Sample 1 Sample 10 Sample 11 Sample 12 Sample 13 ... Sample 9
#Summary of control metrics.
control_metrics_all <- as.data.frame(ctrls)
control_metrics_all$Sample_Name <- AMD_sampleInfo$Sample_Name
head(control_metrics_all)
##   Restoration Staining.Green Staining.Red Extension.Green Extension.Red
## 1  0.03074534       165.6038     18.81818        62.96364      13.23746
## 2  0.04477612       128.6500           NA        59.72438      13.03802
## 3  0.04687976             NA           NA        65.51228      12.58561
## 4  0.04408138       110.1176    259.83333        58.45902      13.48349
## 5  0.05071562       761.2381    224.86667        76.97196      11.92087
## 6  0.06167536       896.2500           NA        63.78764      11.68353
##   Hybridization.High.Medium Hybridization.Medium.Low Target.Removal.1
## 1                  1.680178                 2.077765         17.88889
## 2                  1.663601                 2.134143         13.07968
## 3                  1.581086                 2.200758         12.78210
## 4                  1.577832                 2.034192         13.29508
## 5                  1.593910                 2.112461         15.30476
## 6                  1.564947                 2.074236         12.68093
##   Target.Removal.2 Bisulfite.Conversion.I.Green Bisulfite.Conversion.I.Red
## 1        13.089431                     19.43561                   7.605697
## 2        12.388679                     19.55769                   6.784644
## 3         9.690265                     21.18272                   6.929114
## 4        11.544484                     21.24806                   8.676753
## 5        10.713333                     22.11200                  10.745140
## 6         8.377892                     19.78571                   6.975317
##   Bisulfite.Conversion.II Specificity.I.Green Specificity.I.Red Specificity.II
## 1                9.193029           10.068966          4.773639       27.63091
## 2               10.328638           10.681818          4.826513       28.79827
## 3               11.394419           12.701695          4.674271       27.03368
## 4               15.401290            9.255245          3.654054       32.10127
## 5                5.925329           11.415385          3.245431       25.92169
## 6               10.635226            7.774929          3.172968       27.81843
##   Non.polymorphic.Green Non.polymorphic.Red Sample_Name
## 1              8.395745            9.221387    Sample 1
## 2              9.231855           10.183274   Sample 10
## 3             10.406780           10.298646   Sample 11
## 4              8.893971           10.983957   Sample 12
## 5              9.436782            9.711957   Sample 13
## 6              8.233456            9.702459   Sample 14
sapply(control_metrics_all, function(x) sum(is.na(x)))
##                  Restoration               Staining.Green 
##                            0                           15 
##                 Staining.Red              Extension.Green 
##                           25                            0 
##                Extension.Red    Hybridization.High.Medium 
##                            0                            0 
##     Hybridization.Medium.Low             Target.Removal.1 
##                            0                            0 
##             Target.Removal.2 Bisulfite.Conversion.I.Green 
##                            0                            0 
##   Bisulfite.Conversion.I.Red      Bisulfite.Conversion.II 
##                            0                            0 
##          Specificity.I.Green            Specificity.I.Red 
##                            0                            0 
##               Specificity.II        Non.polymorphic.Green 
##                            0                            0 
##          Non.polymorphic.Red                  Sample_Name 
##                            0                            0
stripchart(ctrls$`Bisulfite Conversion II`, method = "jitter", pch = 4, xlab = 'Bisulfite Conversion II', xlim = c(0,20)) + 
  abline(v = 1, col = 2, lty = 3) +
  text(ctrls$`Bisulfite Conversion II`[ctrls$`Bisulfite Conversion II` < 1], 1.2, labels = ctrls$Sample_Name[ctrls$`Bisulfite Conversion II` < 1], srt = 45)

## integer(0)

or

controlStripPlot(AMD_ExtendedRGSet, controls = c("BISULFITE CONVERSION I", "BISULFITE CONVERSION II"))

Step 6: Contamination check.

The 450K BeadChip also features 65 control probes which assay highly-polymorphic single nucleotide polymorphisms (SNPs) rather than DNA methylation. These are included on the array to allow sample quality control to check for relatedness between individuals and enable the detection of potential sample mix-ups. The signal from these probes is expected to cluster into three distinct groups (representing the heterozygous and two homozygous groups). The snp_outliers function however computes the average log odds from the 65 posterior probabilities from a mixture model to capture how irregular the SNP betas are, i.e. how much they deviate from the ideal trimodal distribution. Although these are not DNA methylation signals, they could be used to provide an indication of the degree of technical variance between samples.

NOTE: All samples passed.

#While ewastools implements the LOESS normalization (Heiss and Brenner, 2015), the developers of the package says not use the normalization "as it does little to protect against batch effects but can result in the removal of genuine biological signal". They recommend to adjust for relevant technical covariates in regression models later.

beta <- dont_normalize(meth)

#Pulling SNP probes.
snps <- meth$manifest[probe_type == "rs", index]
snps <- beta[snps,]

#These SNPs are then used as input for call_genotypes(). This function estimates the parameters of a mixed model consisting of three beta distributions representing one heterozygous and the two homozygous genotypes. There is also a fourth component, shown as a uniform distribution that represents outliers. The functions returns posterior probabilities used for soft classification. 
#In simple words, we are determining the probability for every SNP whether they belong to 1 of 4 different distributions - 3 of which correspond to the 3 expected genotypes (AA, AB, BB), and the 4th distribution corresponds to outside/in-between these expected genotype distributions. SNPs with a higher probability of belonging to this 4th distribution indicate mixing between more than one genotype.

#Fit mixed model to call genotypes.
genotypes_called <- call_genotypes(snps, learn = T)

#Call genotype clusters.
AMD_samplesheet <- AMD_samplesheet %>% 
  mutate(genotype_cluster = as.factor(enumerate_sample_donors(genotypes_called)))

#Examine probability outlier.
plot(snp_outliers(genotypes_called) %>% sort, ylab = "SNP Outliers")

#Overall distribution of the genotypes.
ewastools:::mxm_(genotypes_called)

We see 3 peaks, corresponding to 3 possible genotypes.

#Check the average probability of SNP not belonging to any of the 3 genotypes (coloured by Sex).
AMD_samplesheet <- AMD_samplesheet %>% 
  mutate(Prob_SNP_outlier = colMeans(genotypes_called$outliers, na.rm = T),
         Prob_SNP_outlier_Logodds = snp_outliers(genotypes_called))
  
  ggplot(AMD_samplesheet, aes(x = Sample_Name, y = Prob_SNP_outlier, fill = Sex)) +
  geom_point(shape = 21, size = 2.5, alpha = 0.8, col = "black") + 
  scale_fill_manual(values = c("#bd7b9f", "#2c7dab")) +
  labs(x = 'Samples', y = "Probability", 
       title = 'Average probability of SNP being an outlier') +
  theme_bw() +
  theme(axis.text.x = element_text(size = 14),
        axis.text.y = element_text(size = 14),
        axis.title.x = element_text(size = 15, vjust = -0.3),
        axis.title.y = element_text(size = 15, vjust = 2),
        legend.position = "none") +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.1)) +
  scale_x_discrete(breaks = NULL, expand = c(0.02, 0.02))

The Y-axis denotes the average probability of SNP being an outlier. No sample appears to be > 0.1 probability, indicating that there is minimal probability of the presence of outlier.

#Look at the raw distribution
snp_betas <- getSnpBeta(AMD_ExtendedRGSet)
snp_betas_melt <- t(snp_betas) %>% as_tibble %>% mutate(Sample_Name = colnames(snp_betas)) %>%
  left_join(AMD_samplesheet %>% select(Sample_Name, Sex), by = 'Sample_Name') %>%
  gather(key = 'SNP', value = 'Beta', -Sample_Name, -Sex)
## Warning: Column `Sample_Name` joining character vector and factor, coercing into
## character vector
ggplot(snp_betas_melt, aes(x = SNP, y = Beta, fill = Sex)) +
  geom_point(shape = 21, size = 2.5, alpha = 0.8, col = "black") + 
  scale_fill_manual(values = c("#bd7b9f", "#2c7dab")) +
  labs(x = '59 SNPs') + theme_bw() + theme(axis.text.x = element_blank()) +
  theme_bw() +
  theme(axis.text.x = element_blank()) +
  scale_x_discrete(breaks = NULL, expand = c(0.02, 0.02))

Step 7: Data linkage errors and Sample relations detection.

omicsPrint (Van Iterson et al. 2018) is a package developed to detect data linkage errors through inspecting sample relations in multiple omics studies. Included with the package is the hm450.manifest.pop.GoNL data, which stores SNP probe information in a GRanges class object. This is then used to create a subset of the beta values for genotyping. The function beta2genotype() then genotypes the observations by measuring homozygous or heterozygous alleles at these SNP probes. Lastly alleleSharing() assesses the relationships between different individuals, which can be unrelated, twins, or identical. The results can then be visualised using the inferRelations() function. In the data with sample relationships, this would be shown in the above graph as green or black clusters (Van Iterson et al. 2018). It is important to carry out this type of visualization before probe-filtering as otherwise the genotyping will be based on very few SNPs.

NOTE: No mismatches found.

data(hm450.manifest.pop.GoNL)

betas <- getBeta(AMD_ExtendedRGSet)

cpgs <- names(hm450.manifest.pop.GoNL[mcols(hm450.manifest.pop.GoNL)$MASK.snp5.EAS])
cpgs <- na.omit(match(cpgs, rownames(betas)))
omicsBetas <- betas[cpgs,]
omicsBetas[1:10, 1:2]
##              Sample 1  Sample 10
## cg08477687 0.15064562 0.57708628
## cg00645010 0.05583976 0.06016167
## cg11422233 0.07789995 0.07780564
## cg06402284 0.04116505 0.07233649
## cg01551879 0.04928504 0.05166097
## cg20788133 0.66539440 0.68896612
## cg09139287 0.49239130 0.31366692
## cg23100540 0.17080388 0.13082796
## cg23999112 0.70067114 0.74516195
## cg01062849 0.82885375 0.78417470
dnamCalls <- beta2genotype(omicsBetas, assayName = "exprs")
dim(dnamCalls)
## [1] 597  44
dnamCalls[1:10, 1:2]
##            Sample 1 Sample 10
## cg19405842        2         1
## cg01296877        3         1
## cg21783012        1         2
## cg26422465        1         2
## cg00345083        3         2
## cg15075357        2         3
## cg18285337        2         3
## cg02890259        2         3
## cg15600437        2         2
## cg25593194        3         2
omicsData <- alleleSharing(dnamCalls, verbose = TRUE)
## Hash relations
## Pruning 597 SNPs ...
## 0 SNPs removed because of low call rate!
## 0 samples removed because too few SNPs called!
## Using 597 polymorphic SNPs to determine allele sharing.
## Running `square` IBS algorithm!
## 45 of 990 (4.55%) ...
mismatches <- inferRelations(omicsData)

dim(mismatches)
## [1] 0 6

Step 8: Sample identity.

Hierarchical clustergram across all samples cluster similar samples together while samples that are different from all the other samples are pulled down as outliers.

NOTE: No obvious sample that is pulled down as outlier.

#Use SNP probes to infer identity and see how they cluster. 
snp_betas <- getSnpBeta(AMD_ExtendedRGSet)

identity_dendo <- dist(t(snp_betas))
clust <- hclust(identity_dendo)
dendo <- as.dendrogram(clust)   

dendo %>% dendextend::set("labels_cex", 0.6) %>% 
  hang.dendrogram %>% plot()

Step 9: Outlier detection.

The outlyx function takes any beta matrix (preferably raw) and will identify any samples that are inconsistent with the rest of the data. From the plot, we can observe that any data points that fall into the red squares are indeed outlying and should be removed from analysis.

To confirm the previous identity dendrogram with no obvious outliers, I run the chunk below to see if there are any sample that falls within the two red squares in the plot.

NOTE: No samples fall into the red squares, so no outlier confirmed.

betas <- getBeta(AMD_ExtendedRGSet)
detout <- outlyx(betas)

detout$Sample_Name <- rownames(detout)
detout[which(detout$outliers == T),]
## [1] iqr         mv          outliers    Sample_Name
## <0 rows> (or 0-length row.names)

Step 10: Sex check.

minfi has a sex predictor function that uses the median values of measurements on the X and Y chromosomes respectively. If yMed - xMed is less than cutoff of -2, the sample is predicted as female; otherwise it is predicted as male.

#Sex prediction using minfi's getSex() function. 
AMD_gRSet <- mapToGenome(AMD_MSet) #Convert to GenomicMethylSet object. 
predSex <- getSex(AMD_gRSet)
head(predSex)
## DataFrame with 6 rows and 3 columns
##                       xMed             yMed predictedSex
##                  <numeric>        <numeric>  <character>
## Sample 1    12.08015127114 12.2494097548417            M
## Sample 10 12.2550285396274 12.4141363144996            M
## Sample 11 12.3826240265749 12.5400006922142            M
## Sample 12 12.2884335395868 12.4617801274941            M
## Sample 13 12.1611318829843 12.3721820668771            M
## Sample 14 12.8814007851121 8.92777492058433            F
#Compare to Sex from metadata. 
AMD_samplesheet$Sex <- as.factor(AMD_samplesheet$Sex)
predSex$predictedSex <- as.factor(predSex$predictedSex)
all.equal(AMD_samplesheet$Sex, predSex$predictedSex) #TRUE.
## [1] TRUE
#Plot predicted sex against reported sex from metadata.
predictedSex <- as.data.frame(predSex)
ggplot(predictedSex, aes(x = xMed, y = yMed, fill = AMD_samplesheet$Sex)) +
  geom_point(shape = 21, size = 3, alpha = 0.8, col = "black") +
  scale_fill_manual(values = c("#bd7b9f", "#2c7dab")) +
  theme_classic() +
  theme(axis.text.x = element_text(size = 14),
        axis.text.y = element_text(size = 14),
        axis.title.x = element_text(size = 15, vjust = -0.3),
        axis.title.y = element_text(size = 15, vjust = 2),
        legend.position = "none")

The sex of the samples cluster accordingly.

ewastools has a similar sex predictor function that computes for each sample the average total intensities of all probes targeting either chromosome, X and Y respectively (There are 11,232 probes that target the X chromosome and 413 probes that target the Y chromosome). This function exploits the natural difference in allosomal (sex) copy number (with females having more copy number than males) and the fact that total intensity (U + M) is sensitive to copy number variation to detect sex mismatches. The threshold to discriminate between both sexes is determined by the Hodges-Lehmann estimator (i.e. median of all pairwise male/female averages) for X and Y chromosomes separately. The dotted lines in the figure below represent the Hodges-Lehman estimators separating the male and female cluster centres. The male samples should cluster in the top left quadrant while the female samples should cluster in the bottom right quadrant. The samples that fall in the top right and bottom left quadrants are considered “unclear” (Heiss & Just 2018).

It is suggested that this approach is more robust than minfi’s getSex() function due to its potential to detect sex mismatches and allosomal outliers.

#Sex prediction using ewastools' check_sex() function. 
predicted_sex <- check_sex(meth)
AMD_samplesheet <- AMD_samplesheet %>% mutate(normalized_X_intensity = predicted_sex$X,
                        normalized_Y_intensity = predicted_sex$Y)

#Sex plot.
ggplot(AMD_samplesheet, aes(x = normalized_X_intensity, y = normalized_Y_intensity, fill = Sex)) +
  geom_point(shape = 21, size = 3, alpha = 0.8, col = "black") + theme_classic() +
  scale_fill_manual(values = c("#bd7b9f", "#2c7dab")) +
  geom_text_repel(data = AMD_samplesheet %>% filter(Sex == 'M', normalized_X_intensity > 0.95,
                                         normalized_Y_intensity < 0.5), 
                  aes(label = Sample_Name), size = 3, force = 15, nudge_x = -0.1, nudge_y = -0.1) +
  geom_hline(yintercept = 0.5, linetype = 'dashed', col = '#bd7b9f') + 
  geom_vline(xintercept = 0.95, linetype = 'dashed', col = '#2c7dab') +
  theme(axis.text.x = element_text(size = 14),
        axis.text.y = element_text(size = 14),
        axis.title.x = element_text(size = 15, vjust = -0.3),
        axis.title.y = element_text(size = 15, vjust = 2),
        legend.position = "none")

This second check confirms no sex mismatches in the samples.

Step 11: Post sample filtering PCA.

Code from Nicole Gladish, Rachel Edgar and Sumaiya Islam.

Location: ~/KoborLab/kobor_space/shared_coding_resource/PCA Code.Rmd

betas <- getBeta(AMD_ExtendedRGSet) #Essentially use your mvalue matrix.
meta <- pData(AMD_ExtendedRGSet)
PCA_full <- princomp(na.omit(betas)) #You can't have NAs in your dataframe - make sure to either remove probes with a lot of NAs and/or have imputed values. Can run na.omit but depending on the stage of pre-processing, could result in a lot of probes to be removed and a very inaccurate PCA.
Loadings <- as.data.frame(unclass(PCA_full$loadings))
vars <- PCA_full$sdev^2
Importance <- vars/sum(vars)
adjust <- 1-Importance[1]
pca_adjusted <- Importance[2:length(Importance)]/adjust
pca_df <- data.frame(adjusted_variance = pca_adjusted, PC = seq(1:length(pca_adjusted)))

#Restructure meta so that variables are in the appropriate format - categorical variables (sex is commonly labelled as 0 and 1) are factors and not numeric for example.

colnames(meta)[6] <- "Sentrix_Position"
colnames(meta)[7] <- "Sentrix_ID"

meta$Sentrix_ID <- as.factor(meta$Sentrix_ID)
meta$Sentrix_Position <- as.factor(meta$Sentrix_Position)
meta$Sex <- as.factor(meta$Sex)
meta$Sample_Group <- as.factor(meta$Sample_Group)
meta$Age <- as.numeric(meta$Age)

colnames(meta) #Obtain the column numbers to include.
## [1] "Sample_Name"      "Sample_Group"     "Sex"              "Age"             
## [5] "Tissue"           "Sentrix_Position" "Sentrix_ID"       "Basename"        
## [9] "filenames"
meta_categorical <- data.frame(meta[, c(2,3,6,7)])  #Input column numbers in meta that contain categorical variables.
meta_continuous <- data.frame(meta[,4])  #Input column numbers in meta that contain continuous variables.
colnames(meta_categorical) #Write the line below to ensure you're changing the names of your variables in the right order.
## [1] "Sample_Group"     "Sex"              "Sentrix_Position" "Sentrix_ID"
colnames(meta_categorical) <- c("Disease state", "Sex", "Row", "Chip")
colnames(meta_continuous)
## [1] "meta...4."
colnames(meta_continuous) <- c("Age")

Order <- c(seq(1:sum(ncol(meta_categorical), ncol(meta_continuous))))
Num <- 16 #This number will depend on your dataset - if you only have 6 samples, you should probably only show 5 PCs or less.
#Run PCA.
source("~/KoborLab/kobor_space/kendrix/R_Functions/heat_scree_plot.R", local = knitr::knit_global())
heat_scree_plot(Loadings, Importance, Num, Order)


Normalisation

Step 1: Visualise pre-normalised distribution of beta values.

AMD_MSet <- preprocessRaw(AMD_ExtendedRGSet) #Get MSet object from RGSet.
plotBetasByType(AMD_MSet[,1])

Step 2: Perform GenomeStudio normalisation.

#Perform background correction.
AMD_MSet.illumina <- preprocessIllumina(AMD_ExtendedRGSet, bg.correct = TRUE,
                               normalize = "control")

#Plot to visualise post-normalisation distribution. 
plotBetasByType(AMD_MSet.illumina[,1])

Step 3: Perform SWAN normalisation.

Subset-quantile within array normalization (SWAN) (Jovana Maksimovic, Lavinia Gordon, and Alicia Oshlack 2012) is a within-array normalization correction for the technical differences between the Type I and Type II array designs. The algorithm matches the beta-value distributions of the Type I and Type II probes by applying a within-array quantile normalization separately for different subsets of probes (divided by CpG content).

AMD_MSet <- preprocessRaw(AMD_ExtendedRGSet) #Get MSet object from RGSet.

#SWAN uses a random subset of probes to do the between array normalization. In order to achive reproducible results, the seed needs to be set using set.seed.
set.seed(100)

#Perform SWAN normalisation.
AMD_MSet.swan <- preprocessSWAN(AMD_ExtendedRGSet, mSet = AMD_MSet)

#Plot to visualise post-normalisation distribution.
plotBetasByType(AMD_MSet.swan[,1])

save(AMD_MSet.swan, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_MSet.swan.RData")

Step 4: Perform quantile normalisation.

This function implements stratified quantile normalization preprocessing. The normalization procedure is applied to the Meth and Unmeth intensities separately. The distribution of type I and type II signals is forced to be the same by first quantile normalizing the type II probes across samples and then interpolating a reference distribution to which we normalize the type I probes. Since probe types and probe regions are confounded and we know that DNAm distributions vary across regions we stratify the probes by region before applying this interpolation. Note that this algorithm relies on the assumptions necessary for quantile normalization to be applicable and thus is not recommended for cases where global changes are expected such as in cancer-normal comparisons.

The different arguments for the function can be summarized into the following list:

If fixOutliers is TRUE, the functions fixes outliers of both the methylated and unmethylated channels when small intensities are close to zero.

If removeBadSamples is TRUE, it removes bad samples using the QC criterion discussed previously.

Performs stratified subset quantile normalization if quantileNormalize = TRUE and stratified = TRUE.

Predicts the sex (if not provided in the sex argument) using the function getSex and normalizes males and females separately for the probes on the X and Y chromosomes.

#Perform quantile normalisation.
AMD_gRSet.quantile <- preprocessQuantile(AMD_ExtendedRGSet, 
                                     fixOutliers = TRUE, 
                                     removeBadSamples = TRUE, badSampleCutoff = 10.5, 
                                     quantileNormalize = TRUE, stratified = TRUE, 
                                     mergeManifest = FALSE, sex = NULL)
## [preprocessQuantile] Mapping to genome.
## [preprocessQuantile] Fixing outliers.
## [preprocessQuantile] Quantile normalizing.
#Get manifest information for plot.
probeTypes <- data.frame(Name = featureNames(AMD_gRSet.quantile),
                         Type = getProbeType(AMD_gRSet.quantile))

#Plot to visualise post-normalisation distribution. 
betas.quantile <- getBeta(AMD_gRSet.quantile)
plotBetasByType(betas.quantile[,1], probeTypes = probeTypes)

Step 5: Perform noob normalisation.

#Perform noob normalisation. 
AMD_MSet.noob <- preprocessNoob(AMD_ExtendedRGSet)

#Plot to visualise post-normalisation distribution. 
plotBetasByType(AMD_MSet.noob[,1])

save(AMD_MSet.noob, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_MSet.noob.RData")

Step 6: Perform BMIQ normalisation.

BMIQ normalisation uses quantiles to normalise the Type II probe values into a distribution comparable to the Type I probes using a beta-mixture model fit ot the Type I and Type II probes separately and then transforms the probabilities of class membership of the Type II probes into quantiles of beta values using the parameters of the beta-distributions of the Type I distribution. This method uses a three-state beta-mixture model but does not use fit to the middle ‘hemi-methylated’ component in the normalisation, therefore it does not require a trimodal distribution. The advantage of BMIQ is that it avoids selecting subsets of probes matched for biological characteristics as done in SWAN and quantile normalisation, and was found to be the best algorithm for reducing probe design bias. However, it is also a more aggressive method of normalisation, so in the attempt to make the Type II probe values more similar to the Type I probe values, it may have inadvertently remove biologically meaningful values from the distributions.

#Perform BMIQ normalisation.
betas.BMIQnoob <- BMIQ(AMD_MSet.noob, nfit = 100000)

#Save BMIQ object. 
save(betas.BMIQnoob, file= "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/betas.BMIQnoob.RData")
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/betas.BMIQnoob.RData")

#Plot to visualise post-normalisation distribution.
#Download 450K manifest from shared coding space.
load("~/KoborLab/kobor_space/shared_coding_resource/HM450_fdat.RData")
dim(fData_450)
## [1] 485577     37
#Check if the CpGs correspond with the CpGs from the manifest.
manifest <- fData_450[rownames(fData_450)%in%rownames(betas.BMIQnoob),]
dim(manifest)
## [1] 485512     37
#Subset data based on probe types - Either Type 1 or Type 2. 
probe_types <- fData_450$INFINIUM_DESIGN_TYPE

#Subset Type 1 probe data. 
type_1 <- subset(fData_450, fData_450$INFINIUM_DESIGN_TYPE == "I")
dim(type_1)
## [1] 135501     37
#See how many Type 1 CpGs correspond with the Type 1 probe data.  
type_1betas <- betas.BMIQnoob[rownames(betas.BMIQnoob)%in%rownames(type_1),]
dim(type_1betas)
## [1] 135476     44
#Subset Type 2 probe data. 
type_2 <- subset(fData_450, fData_450$INFINIUM_DESIGN_TYPE == "II")
dim(type_2)
## [1] 350076     37
#See how many Type 2 CpGs correspond with the Type 2 probe data. 
type_2betas <- betas.BMIQnoob[rownames(betas.BMIQnoob)%in%rownames(type_2),]
dim(type_2betas)
## [1] 350036     44
#Plot BMIQ normalised data.
plot(c(0,1), c(0,10), xlab = "Beta values", ylab = "Density", main = "")
lines(density(na.omit(type_1betas)), col = "red", lty = 2, lwd = 2)
lines(density(na.omit(type_2betas)), col = "blue", lty = 1, lwd = 2)
legend("top", inset = 0.05, cex = 1.0, c("Type 1", "Type 2"), col = c("red", "blue"), lty = c(2,1), horiz = FALSE)

Step 7: Perform funnorm normalisation.

The function preprocessFunnorm implements the functional normalization algorithm developed in Jean-Philippe Fortin et al. 2014. Briefly, it uses the internal control probes present on the array to infer between-array technical variation. It is particularly useful for studies comparing conditions with known large-scale differences, such as cancer/normal studies, or between-tissue studies. It has been shown that for such studies, functional normalization outperforms other existing approaches (Jean-Philippe Fortin et al. 2014). By default, the function applies the preprocessNoob function as a first step for background substraction, and uses the first two principal components of the control probes to infer the unwanted variation.

#Perform funnorm normalisation.
AMD_gRSet.funnorm <- preprocessFunnorm(AMD_ExtendedRGSet)
## [preprocessFunnorm] Background and dye bias correction with noob
## [preprocessFunnorm] Mapping to genome
## [preprocessFunnorm] Quantile extraction
## [preprocessFunnorm] Normalization
#Get manifest information for plot.
probeTypes <- data.frame(Name = featureNames(AMD_gRSet.funnorm),
                         Type = getProbeType(AMD_gRSet.funnorm))

#Plot to visualise post-normalisation distribution. 
betas.funnorm <- getBeta(AMD_gRSet.funnorm)
plotBetasByType(betas.funnorm[,1], probeTypes = probeTypes)

save(AMD_gRSet.funnorm, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_gRSet.RData")

Step 8: Post-normalisation PCA.

Here, I am performing PCA on funnorm normalised betas.

#Load the normalised beta matrix of choice.
betas <- betas.funnorm

meta <- pData(AMD_gRSet.funnorm)
PCA_full <- princomp(betas) #You can't have NAs in your dataframe - make sure to either remove probes with a lot of NAs and/or have imputed values. Can run na.omit but depending on the stage of pre-processing, could result in a lot of probes to be removed and a very inaccurate PCA.
Loadings <- as.data.frame(unclass(PCA_full$loadings))
vars <- PCA_full$sdev^2
Importance <- vars/sum(vars)
adjust <- 1-Importance[1]
pca_adjusted <- Importance[2:length(Importance)]/adjust
pca_df <- data.frame(adjusted_variance = pca_adjusted, PC = seq(1:length(pca_adjusted)))

#Restructure meta so that variables are in the appropriate format - categorical variables (sex is commonly labelled as 0 and 1) are factors and not numeric for example.

colnames(meta)[6] <- "Sentrix_Position"
colnames(meta)[7] <- "Sentrix_ID"

meta$Sentrix_ID <- as.factor(meta$Sentrix_ID)
meta$Sentrix_Position <- as.factor(meta$Sentrix_Position)
meta$Sex <- as.factor(meta$Sex)
meta$Sample_Group <- as.factor(meta$Sample_Group)
meta$Age <- as.numeric(meta$Age)

colnames(meta) #Obtain the column numbers to include.
##  [1] "Sample_Name"      "Sample_Group"     "Sex"              "Age"             
##  [5] "Tissue"           "Sentrix_Position" "Sentrix_ID"       "Basename"        
##  [9] "filenames"        "xMed"             "yMed"             "predictedSex"
meta_categorical <- data.frame(meta[, c(2,3,6,7)])  #Input column numbers in meta that contain categorical variables.
meta_continuous <- data.frame(meta[,4])  #Input column numbers in meta that contain continuous variables.
colnames(meta_categorical) #Write the line below to ensure you're changing the names of your variables in the right order.
## [1] "Sample_Group"     "Sex"              "Sentrix_Position" "Sentrix_ID"
colnames(meta_categorical) <- c("Disease state", "Sex", "Row", "Chip")
colnames(meta_continuous)
## [1] "meta...4."
colnames(meta_continuous) <- c("Age")

Order <- c(seq(1:sum(ncol(meta_categorical), ncol(meta_continuous))))
Num <- 16 #This number will depend on your dataset - if you only have 6 samples, you should probably only show 5 PCs or less.
#Run PCA.
source("~/KoborLab/kobor_space/kendrix/R_Functions/heat_scree_plot.R", local = knitr::knit_global())
heat_scree_plot(Loadings, Importance, Num, Order)

Step 9: Sanity check.

#Get raw betas to compare.
betas.raw <- getBeta(AMD_ExtendedRGSet)

#Check sample order for sample-sample correlation.
identical(rownames(betas.raw), rownames(betas.funnorm)) #FALSE.
## [1] FALSE
identical(colnames(betas.raw), colnames(betas.funnorm)) #TRUE.
## [1] TRUE
#Reorder probe order.
betas.raw <- betas.raw[order(rownames(betas.raw)),]
betas.funnorm <- betas.funnorm[order(rownames(betas.funnorm)),]

#Recheck sample order for sample-sample correlation.
identical(rownames(betas.raw), rownames(betas.funnorm)) #TRUE.
## [1] TRUE
identical(colnames(betas.raw), colnames(betas.funnorm)) #TRUE.
## [1] TRUE
#Sample-sample correlation. 
cor.raw_funnorm <- cor(betas.raw, betas.funnorm, use = "pairwise.complete.obs") #Input: Beta values to compare. 
plot(xaxt = "n", ylim = c(0.95, 1), xlab = "", ylab = "", diag(cor.raw_funnorm), main = "Sample-sample correlation", pch = 16, panel.first = grid()) +
axis(1, 1:44, labels = rownames(cor.raw_funnorm), las = 3, cex.axis = 0.8)

## numeric(0)
#Difference in beta values before and after normalisation.
diff = betas.funnorm - betas.raw
hist(diff, cex.axis = 0.8, breaks = 100, xlab = "", ylab = "", xlim = c(-0.8, 0.8), las = 1, main = "Difference in beta values before and after normalisation")


Quality Control - Probe Filtering

Step 1: Sex probes removal.

Probes that target the sex chromosomes are removed as both males and females have unequal amount X chromosomes - with females having twice the amount of X chromosomes as compared to males. So they need to be removed so that the analysis is not skewed on the basis of this disproportionate natural difference in chromosomal number.

load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_gRSet.RData")

data(IlluminaHumanMethylation450kmanifest)
data(Locations)

#Check probe count before removal.
dim(AMD_gRSet.funnorm) #485512 probes.
## [1] 485512     44
#Get sex probes information from manifest file.
sex_probes <- Locations[which(Locations$chr == "chrY" | Locations$chr == "chrX"), ] %>% as.data.frame() 
dim(sex_probes) #19627 probes.
## [1] 19627     3
#Check the number of sex probes in the beta matrix.
AMD_sex_probes <- AMD_gRSet.funnorm[which(rownames(AMD_gRSet.funnorm) %in% rownames(sex_probes)),]
dim(AMD_sex_probes) #10583 probes.
## [1] 10583    44
#Remove sex probes from AMD beta matrix.
AMD_gRSet.funnorm <- AMD_gRSet.funnorm[!(rownames(AMD_gRSet.funnorm)%in%rownames(sex_probes)),]
dim(AMD_gRSet.funnorm) #474929 probes remaining after sex probes filtering. 
## [1] 474929     44

Step 2: Cross-reactive and multi-mapped probes removal.

Non-specific and cross-hybridising probes are filtered according to the probes identified by Chen et al. and Langmead & Salzberg. There are probes that target multiple sites in the human genome and/or pose potential hybridisation issues, and thus result in inaccurate methylation signals detection.

Cross-reactive probes target highly repetitive sequences or co-hybridise to alternate sequences that are highly homologous to the intended targets, which could lead to the detection of spurious signals and potentially resulting in invalud conclusions and lack of validation in downstream analyses (Chen et al. 2013). Chen et al. (2013) proposes using the minimum number of bases matched to unintended targets of 47 bases for Infinium I and II probes to be used as criteria to identify cross-reactivity. They identified 8.4% of the Infinium I probes and 5.1% of the Infinium II probes (total: 6.0%) to be cross-reactive.

#Load Price annotation.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/fData_450_Price.RData")

#XY cross-hybridising probes.
XY_cross_hyb <- fData_450_Price[fData_450_Price$XY_Hits == "XY_YES",]$ILMNID
length(XY_cross_hyb) #12388 XY cross-hybridising probes.
## [1] 12388
AMD_gRSet.funnorm <- AMD_gRSet.funnorm[!(rownames(AMD_gRSet.funnorm) %in% XY_cross_hyb),] 
dim(AMD_gRSet.funnorm) #463426 probes after XY cross-hybridising probes removal.
## [1] 463426     44
#Autosomal cross-hybridising probes.
auto_cross_hyb <- fData_450_Price[fData_450_Price$Autosomal_Hits == "A_YES",]$ILMNID
length(auto_cross_hyb) #40650 autosomal cross-hybridising probes.
## [1] 40650
AMD_gRSet.funnorm <- AMD_gRSet.funnorm[!(rownames(AMD_gRSet.funnorm) %in% auto_cross_hyb),] 
dim(AMD_gRSet.funnorm) #434216 probes after autosomal cross-hybridising probes removal.
## [1] 434216     44
#Load cross-hybridising and multi-mapped probes.
#cross.react <- read.csv("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Non_specific_probes_450K.csv", header = TRUE, as.is = TRUE)
#cross.react.probes <- as.character(cross.react$TargetID)

#multi.map <- read.table("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Cross_hybridising_probes_450K.txt", header = FALSE, as.is = TRUE)
#multi.map.probes <- as.character(multi.map$V1)

#Check for overlap between the two probe sets.
#filter.probes <- unique(c(cross.react.probes, multi.map.probes))
#length(filter.probes) #38941 probes.

#Remove cross-hybridising and multi-mapped probes. 
#table(rownames(betas.BMIQnoob) %in% filter.probes) #37760 cross-hybridising and multi-mapped probes in BMIQ-normalised beta matrix.
#betas.BMIQnoob <- betas.BMIQnoob[!(rownames(betas.BMIQnoob) %in% filter.probes),]
#dim(betas.BMIQnoob) #437169 probes remaining after cross-hybridising and multi-mapped probes are filtered.

Step 3: Polymorphic probes removal.

Polymorphic probes are probes that target CpG sites that overlap known SNPs. The methylation levels detected for such CpGs can be greatly influenced by the underlying genetic polymorphism - thus they should be interpreted with caution (Chen et al. 2013). By cross-matching the genomic positions of both C and G of all array-targeted CpGs and the position of single base extension (Infinium I) to that of known SNPs in the 1,000 Genome database, we found 9.4% of the Infinium I probes and 15.5% of the Infinium II probes (total probes: 13.8%) to have methylation levels that could potentially be affected by genetic polymorphism. Caveat: Although it is shown that methylation profile from the Illumina HM450 microarray could be greatly affected by genetic polymorphism, the majority of the SNPs are rare with very low alternative allele frequencies, thus they would not be expected to have a major effect on methylation data when the population under study does not demonstrate a significant frequency of the rare allele. Studies that are focused on intraindividual differences rather than interindividual differences (such as tumour/normal tisssue differences; longitudinal evaluation of methylation profiles; monozygotic twin studies) are not expected to be confounded by such underlying SNPs.

Pidsley et al. identify overlap with genetic variant categories with minor allele frequency > 5% at: (1) target CpG sites (n = 12,378); (2) single base extension sites of Type I probes (n = 772) using the EPIC array.

polymorphic_probes <- read.csv("~/KoborLab/kobor_space/kendrix/R_Functions/Pidsley_EPIC_polymorphic_probes.csv", header = TRUE)
polymorphic_probes <- as.character(polymorphic_probes$PROBE)
length(polymorphic_probes) #12510 polymorphic probes.
## [1] 12510
polymorphic_base_ext <- read.csv("~/KoborLab/kobor_space/kendrix/R_Functions/Pidsley_EPIC_single_base_ext_overlapping_SNP.csv", header = TRUE)
polymorphic_base_ext <- as.character(polymorphic_base_ext$PROBE)
length(polymorphic_base_ext) #414 probes that overlap with SNP at single base extension sites.
## [1] 414
#Get overlap between two probe sets.
polymorphic_probes_to_filter <- unique(c(polymorphic_probes, polymorphic_base_ext))
length(polymorphic_probes_to_filter) #12679 probes to filter. 
## [1] 12679
#Remove polymorphic probes.
AMD_gRSet.funnorm <- AMD_gRSet.funnorm[!(rownames(AMD_gRSet.funnorm) %in% polymorphic_probes_to_filter),] 
dim(AMD_gRSet.funnorm) #427786 probes after polymorphic probes removal.
## [1] 427786     44

Step 4: Bad detection p-value and low beadcount probes removal.

load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_detp_beadcount_QC.RData")
AMD_samplesheet <- read.csv("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/AMD_samplesheet.csv", header = TRUE)

#Create a dummy matrix
bad_probes <- matrix(data = F, nrow = nrow(detp_minfi), ncol = ncol(detp_minfi),
                        dimnames = list(rownames(detp_minfi), colnames(detp_minfi))) %>% as.data.frame

#Designate TRUE to where beadcount < 3 and detection p-value > 0.01.
bad_probes[is.na(bead)] <- T
bad_probes[detp_minfi > 0.01] <- T

#Number of failed probes by samples.
AMD_samplesheet <- AMD_samplesheet %>% mutate(bad_probes = colSums(bad_probes))

#Check for samples that fail in more than 1% of total probes that have beadcount < 3 and detection p-value > 0.01.
AMD_samplesheet %>% 
  mutate(Sample_Name = factor(as.character(Sample_Name), levels = Sample_Name)) %>%
  ggplot(aes(x = Sample_Name, y = bad_probes)) +
  geom_point() +
  labs(x = 'Samples', y = '# probes with bead count < 3 or detection p > 0.01') +
  geom_hline(yintercept = 0.01*nrow(bad_probes), linetype = 'dashed', color = 'red') +
  geom_text(aes(x = 0, y = 0.01*nrow(bead)), label = '1%', vjust = -0.5, hjust = -0.5, color = 'red') +
  geom_label_repel(data = AMD_samplesheet %>% filter(bad_probes > 0.01*nrow(detp_minfi)),
                   aes(x = Sample_Name, y = bad_probes, label = Sample_Name),
                   force = 100, show.legend = F) +
  theme_classic() + theme(axis.text.x = element_blank()) 

#Check for probes that are unsuccesfully measured in nth% of samples.
n_samples <- ncol(AMD_gRSet.funnorm)
bad_probes_count <- rowSums(bad_probes)

#Check thresholds to determine number of probes to remove:
#Probes that are unsuccessfully measured in 1% of samples.
sum(bad_probes_count > 0.010*n_samples) #18814 probes.
## [1] 18814
#Probes that are unsuccessfully measured in 2.5% of samples.
sum(bad_probes_count > 0.025*n_samples) #5050 probes.
## [1] 5050
#Probes that are unsuccesfully measured in 5% of samples.
sum(bad_probes_count > 0.050*n_samples) #3318 probes.
## [1] 3318
#Remove probes that are unsuccessfully measured in 5% of samples.
remove_probes <- data.frame(probe_ID = rownames(detp_minfi)) %>% mutate(bad_probes_count = bad_probes_count) %>% filter(bad_probes_count > 0.05*n_samples)
AMD_gRSet.funnorm.filt <- AMD_gRSet.funnorm[!(rownames(AMD_gRSet.funnorm) %in% remove_probes$probe_ID),]
dim(AMD_gRSet.funnorm.filt) #425456 probes remaining after detection p-value and beadcount probe filtering. 
## [1] 425456     44

Step 5: Post probe filtering PCA.

#Load the probe filtered beta matrix.
betas.funnorm <- getBeta(AMD_gRSet.funnorm.filt)
meta <- pData(AMD_gRSet.funnorm.filt)

PCA_full <- princomp(betas.funnorm) #You can't have NAs in your dataframe - make sure to either remove probes with a lot of NAs and/or have imputed values. Can run na.omit but depending on the stage of pre-processing, could result in a lot of probes to be removed and a very inaccurate PCA.
Loadings <- as.data.frame(unclass(PCA_full$loadings))
vars <- PCA_full$sdev^2
Importance <- vars/sum(vars)
adjust <- 1-Importance[1]
pca_adjusted <- Importance[2:length(Importance)]/adjust
pca_df <- data.frame(adjusted_variance = pca_adjusted, PC = seq(1:length(pca_adjusted)))

#Restructure meta so that variables are in the appropriate format - categorical variables (sex is commonly labelled as 0 and 1) are factors and not numeric for example.

colnames(meta)[6] <- "Sentrix_Position"
colnames(meta)[7] <- "Sentrix_ID"

meta$Sentrix_ID <- as.factor(meta$Sentrix_ID)
meta$Sentrix_Position <- as.factor(meta$Sentrix_Position)
meta$Sex <- as.factor(meta$Sex)
meta$Sample_Group <- as.factor(meta$Sample_Group)
meta$Age <- as.numeric(meta$Age)

colnames(meta) #Obtain the column numbers to include.
##  [1] "Sample_Name"      "Sample_Group"     "Sex"              "Age"             
##  [5] "Tissue"           "Sentrix_Position" "Sentrix_ID"       "Basename"        
##  [9] "filenames"        "xMed"             "yMed"             "predictedSex"
meta_categorical <- data.frame(meta[, c(2,3,6,7)])  #Input column numbers in meta that contain categorical variables.
meta_continuous <- data.frame(meta[,4])  #Input column numbers in meta that contain continuous variables.
colnames(meta_categorical) #Write the line below to ensure you're changing the names of your variables in the right order.
## [1] "Sample_Group"     "Sex"              "Sentrix_Position" "Sentrix_ID"
colnames(meta_categorical) <- c("Disease state", "Sex", "Row", "Chip")
colnames(meta_continuous)
## [1] "meta...4."
colnames(meta_continuous) <- c("Age")

Order <- c(seq(1:sum(ncol(meta_categorical), ncol(meta_continuous))))
Num <- 16 #This number will depend on your dataset - if you only have 6 samples, you should probably only show 5 PCs or less.
#Run PCA.
source("~/KoborLab/kobor_space/kendrix/R_Functions/heat_scree_plot.R", local = knitr::knit_global())
heat_scree_plot(Loadings, Importance, Num, Order)

Step 6: Save probe-filtered funnorm-normalised betas.

save(AMD_gRSet.funnorm.filt, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_gRSet.funnorm.filt.RData")

Cell Type Prediction

Step 1: EpiDISH.

library(EpiDISH)

#Load combat corrected data.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_gRSet.funnorm.filt.RData")

AMD_pData <- as.data.frame(pData(AMD_gRSet.funnorm.filt))
betas.funnorm.filt <- getBeta(AMD_gRSet.funnorm.filt)

#Sanity checks.
sum(is.na(betas.funnorm.filt)) #0 NA.
## [1] 0
sum(is.infinite(betas.funnorm.filt)) #0 inf. values.
## [1] 0
#Load EpiDISH reference.
load("~/KoborLab/kobor_space/shared_coding_resource/EpiDish_Refs/centEpiFibIC.m.rda")
load("~/KoborLab/kobor_space/shared_coding_resource/EpiDish_Refs/centBloodSub.m.rda")

#Check to see if all the probes in the EpiDISH reference are in the combat corrected dataset.
all(rownames(centBloodSub.m) %in% rownames(betas.funnorm.filt)) #FALSE.
## [1] FALSE
dim(centBloodSub.m[which(!rownames(centBloodSub.m) %in% rownames(betas.funnorm.filt)),]) #10 probes in the reference not in the combat corrected dataset.
## [1] 10  7
all(rownames(centEpiFibIC.m) %in% rownames(betas.funnorm.filt)) #FALSE.
## [1] FALSE
dim(centEpiFibIC.m[which(!rownames(centEpiFibIC.m) %in% rownames(betas.funnorm.filt)),]) #27 probes in the reference not in the combat corrected dataset.
## [1] 27  3
AMD_Epi_counts <- as.data.frame(hepidish(betas.funnorm.filt, ref1.m = centEpiFibIC.m, ref2.m = centBloodSub.m, method = "RPC", h.CT.idx = 3))

#Plot EpiDISH cell type prediction. 
AMD_Epi.melt <- gather(AMD_Epi_counts)
colnames(AMD_Epi.melt)[1] <- "Cell_type"
colnames(AMD_Epi.melt)[2] <- "Estimated_proportions"

ggplot(AMD_Epi.melt, aes(factor(Cell_type), Estimated_proportions)) + 
  geom_boxplot(aes(fill = Cell_type), outlier.shape = NA) + 
  geom_jitter(aes(fill = Cell_type), size = 2, shape = 21, alpha = 0.7, 
              position = position_jitterdodge(dodge.width = 0.7, jitter.width = 0.1)) + 
  labs(x = "Cell Type", y = "Estimated Proportions",
       caption = "Estimated cell type proportion using EpiDISH") +
  scale_fill_manual(values = c("#7F44AB", "#56B4E9", "#E69F00", "#CC79A7", "#0072B2", "#009E73", "#D55E00", "#994C00", "#FFCCCC")) +
  theme_classic() + theme(plot.caption = element_text(hjust = 0.5, size = 12), legend.position = "none")

Step 2: Cell type PCA.

meta <- as.data.frame(AMD_pData)

#Check sample order to add cell type prediction to metadata.
identical(rownames(meta), rownames(AMD_Epi_counts)) #TRUE.
## [1] TRUE
#Add cell type predictions to metadata.
meta <- cbind(meta, AMD_Epi_counts)

M_values.funnorm.filt <- beta2m(betas.funnorm.filt)
PCA_full <- princomp(M_values.funnorm.filt) #You can't have NAs in your dataframe - make sure to either remove probes with a lot of NAs and/or have imputed values. Can run na.omit but depending on the stage of pre-processing, could result in a lot of probes to be removed and a very inaccurate PCA.
Loadings <- as.data.frame(unclass(PCA_full$loadings))
vars <- PCA_full$sdev^2
Importance <- vars/sum(vars)
adjust <- 1-Importance[1]
pca_adjusted <- Importance[2:length(Importance)]/adjust
pca_df <- data.frame(adjusted_variance = pca_adjusted, PC = seq(1:length(pca_adjusted)))

#Restructure meta so that variables are in the appropriate format - categorical variables (sex is commonly labelled as 0 and 1) are factors and not numeric for example.

colnames(meta)[6] <- "Sentrix_Position"
colnames(meta)[7] <- "Sentrix_ID"

meta$Sentrix_ID <- as.factor(meta$Sentrix_ID)
meta$Sentrix_Position <- as.factor(meta$Sentrix_Position)
meta$Sex <- as.factor(meta$Sex)
meta$Sample_Group <- as.factor(meta$Sample_Group)
meta$Age <- as.numeric(meta$Age)
meta$Epi <- as.numeric(meta$Epi)
meta$Fib <- as.numeric(meta$Fib)
meta$B <- as.numeric(meta$B)
meta$NK <- as.numeric(meta$NK)
meta$CD4T <- as.numeric(meta$CD4T)
meta$CD8T <- as.numeric(meta$CD8T)
meta$Mono <- as.numeric(meta$Mono)
meta$Neutro <- as.numeric(meta$Neutro)
meta$Eosino <- as.numeric(meta$Eosino)


colnames(meta) #Obtain the column numbers to include.
##  [1] "Sample_Name"      "Sample_Group"     "Sex"              "Age"             
##  [5] "Tissue"           "Sentrix_Position" "Sentrix_ID"       "Basename"        
##  [9] "filenames"        "xMed"             "yMed"             "predictedSex"    
## [13] "Epi"              "Fib"              "B"                "NK"              
## [17] "CD4T"             "CD8T"             "Mono"             "Neutro"          
## [21] "Eosino"
meta_categorical <- data.frame(meta[, c(2,3,6,7)])  #Input column numbers in meta that contain categorical variables.
meta_continuous <- data.frame(meta[, c(4,13,14,15,16,17,19,20,21)])  #Input column numbers in meta that contain continuous variables.
colnames(meta_categorical) #Write the line below to ensure you're changing the names of your variables in the right order.
## [1] "Sample_Group"     "Sex"              "Sentrix_Position" "Sentrix_ID"
colnames(meta_categorical) <- c("Disease state", "Sex", "Row", "Chip")
colnames(meta_continuous)
## [1] "Age"    "Epi"    "Fib"    "B"      "NK"     "CD4T"   "Mono"   "Neutro"
## [9] "Eosino"
colnames(meta_continuous) <- c("Age", "Epithelial", "Fibroblast", "B Cell", "NK Cell", "CD4T", "Monocyte", "Neutrophil", "Eosinophil")

Order <- c(seq(1:sum(ncol(meta_categorical), ncol(meta_continuous))))
Num <- 16 #This number will depend on your dataset - if you only have 6 samples, you should probably only show 5 PCs or less.
#Run PCA.
source("~/KoborLab/kobor_space/kendrix/R_Functions/heat_scree_plot.R", local = knitr::knit_global())
heat_scree_plot(Loadings, Importance, Num, Order)

Step 3: Save objects.

save(AMD_Epi_counts, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/EpiDISH_celltype_prediction.RData")

SVA

Surrogate variable analysis (SVA) (Jeffrey T Leek and John D Storey 2007, Jeffrey T Leek and John D Storey (2008)) is a useful tool to identified surrogate variables for unwanted variation while protecting for a phenotype of interest. In our experience, running SVA after normalizing the 450K data with preprocessFunnorm or preprocessQuantile increases the statistical power of the downstream analysis. For instance, to run SVA on the M-values, protecting for case-control status, the following code can be used to estimate the surrogate variables (this can take a few hours to run).

Input: Instead of using ComBat-corrected data to determine SVs, use probe-filtered data instead because Disease State may be confounded with Row, so using ComBat will wipe out the Disease State effects.

Step 1: SVA v1 - All variables: Age + Disease State + Sex + Row + Chip.

NOTE: 1 significant SV.

#Load objects.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_gRSet.funnorm.filt.RData")
AMD_pData <- as.data.frame(pData(AMD_gRSet.funnorm.filt))
M_values.funnorm.filt <- getM(AMD_gRSet.funnorm.filt)

#SVA with Disease State as fixed variable.
mod <- model.matrix(~ as.numeric(Age) + as.factor(Sample_Group) + as.factor(Sex) + as.factor(Slide) + as.factor(Array), data = AMD_pData) #Full model = variable of interest + adjustment variables (co-variates).
mod0 <- model.matrix(~as.factor(Sample_Group) + as.factor(Sex) + as.factor(Slide) + as.factor(Array), data = AMD_pData) #Null model = adjustment variables only. 
identical(rownames(mod), colnames(M_values.funnorm.filt)) #TRUE.
## [1] TRUE
identical(rownames(mod0), colnames(M_values.funnorm.filt)) #TRUE.
## [1] TRUE
#Check how many SVs.
n.sv <- num.sv(M_values.funnorm.filt, mod, method = "leek")
sva.results.all <- sva(M_values.funnorm.filt, mod, mod0, n.sv) 
## Number of significant surrogate variables is:  1 
## Iteration (out of 5 ):1  2  3  4  5
#sva outputs four components: sv, pprob.gam, pprob.b, n.sv.
#sv = a matrix whose columns correspond to the estimated surrogate variables.
#pprob.gam = posterior probability that each CpG is associated with one or more latent variables. 
#pprob.b = posterior probability that each CpG is associated with the variables of interest. 
#n.sv = the number of surrogate variables estimated by the sva. 

Step 2: SVA v2 - Confounding variables w.r.t Age: Age + Sex + Row + Chip.

NOTE: 1 significant SV.

#Load objects.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_gRSet.funnorm.filt.RData")
AMD_pData <- as.data.frame(pData(AMD_gRSet.funnorm.filt))
M_values.funnorm.filt <- getM(AMD_gRSet.funnorm.filt)

#SVA with Disease State as fixed variable.
mod <- model.matrix(~ as.numeric(Age) + as.factor(Sex) + as.factor(Slide) + as.factor(Array), data = AMD_pData) #Full model = variable of interest + adjustment variables (co-variates).
mod0 <- model.matrix(~ as.factor(Sex) + as.factor(Slide) + as.factor(Array), data = AMD_pData) #Null model = adjustment variables only. 
identical(rownames(mod), colnames(M_values.funnorm.filt)) #TRUE.
## [1] TRUE
identical(rownames(mod0), colnames(M_values.funnorm.filt)) #TRUE.
## [1] TRUE
#Check how many SVs.
n.sv <- num.sv(M_values.funnorm.filt, mod, method = "leek")
sva.results.con <- sva(M_values.funnorm.filt, mod, mod0, n.sv) 
## Number of significant surrogate variables is:  1 
## Iteration (out of 5 ):1  2  3  4  5
#sva outputs four components: sv, pprob.gam, pprob.b, n.sv.
#sv = a matrix whose columns correspond to the estimated surrogate variables.
#pprob.gam = posterior probability that each CpG is associated with one or more latent variables. 
#pprob.b = posterior probability that each CpG is associated with the variables of interest. 
#n.sv = the number of surrogate variables estimated by the sva. 

Step 3: SVA v3 - Batch variables w.r.t Age: Age + Row + Chip.

NOTE: 1 significant SV.

#Load objects.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_gRSet.funnorm.filt.RData")
AMD_pData <- as.data.frame(pData(AMD_gRSet.funnorm.filt))
M_values.funnorm.filt <- getM(AMD_gRSet.funnorm.filt)

#SVA with Disease State as fixed variable.
mod <- model.matrix(~ as.numeric(Age) + as.factor(Slide) + as.factor(Array), data = AMD_pData) #Full model = variable of interest + adjustment variables (co-variates).
mod0 <- model.matrix(~ as.factor(Slide) + as.factor(Array), data = AMD_pData) #Null model = adjustment variables only. 
identical(rownames(mod), colnames(M_values.funnorm.filt)) #TRUE.
## [1] TRUE
identical(rownames(mod0), colnames(M_values.funnorm.filt)) #TRUE.
## [1] TRUE
#Check how many SVs.
n.sv <- num.sv(M_values.funnorm.filt, mod, method = "leek")
sva.results.batch <- sva(M_values.funnorm.filt, mod, mod0, n.sv) 
## Number of significant surrogate variables is:  1 
## Iteration (out of 5 ):1  2  3  4  5
#sva outputs four components: sv, pprob.gam, pprob.b, n.sv.
#sv = a matrix whose columns correspond to the estimated surrogate variables.
#pprob.gam = posterior probability that each CpG is associated with one or more latent variables. 
#pprob.b = posterior probability that each CpG is associated with the variables of interest. 
#n.sv = the number of surrogate variables estimated by the sva. 

Step 4: SVA v4 - Only Age: Age.

NOTE: 1 significant SV.

#Load objects.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_gRSet.funnorm.filt.RData")
AMD_pData <- as.data.frame(pData(AMD_gRSet.funnorm.filt))
M_values.funnorm.filt <- getM(AMD_gRSet.funnorm.filt)

#SVA with Disease State as fixed variable.
mod <- model.matrix(~ as.numeric(Age), data = AMD_pData) #Full model = variable of interest + adjustment variables (co-variates).
mod0 <- model.matrix(~ 1, data = AMD_pData) #Null model = adjustment variables only. 
identical(rownames(mod), colnames(M_values.funnorm.filt)) #TRUE.
## [1] TRUE
identical(rownames(mod0), colnames(M_values.funnorm.filt)) #TRUE.
## [1] TRUE
#Check how many SVs.
n.sv <- num.sv(M_values.funnorm.filt, mod, method = "leek")
sva.results.age <- sva(M_values.funnorm.filt, mod, mod0, n.sv) 
## Number of significant surrogate variables is:  1 
## Iteration (out of 5 ):1  2  3  4  5
#sva outputs four components: sv, pprob.gam, pprob.b, n.sv.
#sv = a matrix whose columns correspond to the estimated surrogate variables.
#pprob.gam = posterior probability that each CpG is associated with one or more latent variables. 
#pprob.b = posterior probability that each CpG is associated with the variables of interest. 
#n.sv = the number of surrogate variables estimated by the sva. 

Step 5: Check correlation between SVs.

SVs <- cbind(sva.results.all$sv, sva.results.con$sv)
SVs <- cbind(SVs, sva.results.batch$sv)
SVs <- cbind(SVs, sva.results.age$sv)
SVs <- as.data.frame(SVs)

colnames(SVs) <- c("SV_All", "SV_Confounders", "SV_Batch", "SV_Only_Age")
#Check for correlation between the SVs.
ggplot(SVs, aes(SV_All, SV_Batch)) + geom_point() + theme_classic()

ggplot(SVs, aes(SV_All, SV_Confounders)) + geom_point() + theme_classic()

ggplot(SVs, aes(SV_Batch, SV_Confounders)) + geom_point() + theme_classic()

ggplot(SVs, aes(SV_Only_Age, SV_All)) + geom_point() + theme_classic()

ggplot(SVs, aes(SV_Only_Age, SV_Batch)) + geom_point() + theme_classic()

ggplot(SVs, aes(SV_Only_Age, SV_Confounders)) + geom_point() + theme_classic()

Step 6: Add cell type prediction and SV to AMD_pData.

load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/EpiDISH_celltype_prediction.RData")

#Add SV and cell type prediction to AMD_pData.
identical(rownames(AMD_pData), rownames(AMD_Epi_counts))
## [1] TRUE
AMD_pData <- cbind(AMD_pData, SVs$SV_All)
AMD_pData <- cbind(AMD_pData, AMD_Epi_counts)
colnames(AMD_pData)[13] <- "SV"

Step 7: SV + Cell Type PCA.

Only SV PCA.

meta <- as.data.frame(AMD_pData)
PCA_full <- princomp(M_values.funnorm.filt) #You can't have NAs in your dataframe - make sure to either remove probes with a lot of NAs and/or have imputed values. Can run na.omit but depending on the stage of pre-processing, could result in a lot of probes to be removed and a very inaccurate PCA.
Loadings <- as.data.frame(unclass(PCA_full$loadings))
vars <- PCA_full$sdev^2
Importance <- vars/sum(vars)
adjust <- 1-Importance[1]
pca_adjusted <- Importance[2:length(Importance)]/adjust
pca_df <- data.frame(adjusted_variance = pca_adjusted, PC = seq(1:length(pca_adjusted)))

#Restructure meta so that variables are in the appropriate format - categorical variables (sex is commonly labelled as 0 and 1) are factors and not numeric for example.

colnames(meta)[6] <- "Sentrix_Position"
colnames(meta)[7] <- "Sentrix_ID"

meta$Sentrix_ID <- as.factor(meta$Sentrix_ID)
meta$Sentrix_Position <- as.factor(meta$Sentrix_Position)
meta$Sex <- as.factor(meta$Sex)
meta$Sample_Group <- as.factor(meta$Sample_Group)
meta$Age <- as.numeric(meta$Age)
meta$SV <- as.numeric(meta$SV)

colnames(meta) #Obtain the column numbers to include.
##  [1] "Sample_Name"      "Sample_Group"     "Sex"              "Age"             
##  [5] "Tissue"           "Sentrix_Position" "Sentrix_ID"       "Basename"        
##  [9] "filenames"        "xMed"             "yMed"             "predictedSex"    
## [13] "SV"               "Epi"              "Fib"              "B"               
## [17] "NK"               "CD4T"             "CD8T"             "Mono"            
## [21] "Neutro"           "Eosino"
meta_categorical <- data.frame(meta[, c(2,3,6,7)])  #Input column numbers in meta that contain categorical variables.
meta_continuous <- data.frame(meta[, c(4,13)])  #Input column numbers in meta that contain continuous variables.
colnames(meta_categorical) #Write the line below to ensure you're changing the names of your variables in the right order.
## [1] "Sample_Group"     "Sex"              "Sentrix_Position" "Sentrix_ID"
colnames(meta_categorical) <- c("Disease State", "Sex", "Row", "Chip")
colnames(meta_continuous)
## [1] "Age" "SV"
colnames(meta_continuous) <- c("Age", "SV")

Order <- c(seq(1:sum(ncol(meta_categorical), ncol(meta_continuous))))
Num <- 16 #This number will depend on your dataset - if you only have 6 samples, you should probably only show 5 PCs or less.
#Run PCA.
source("~/KoborLab/kobor_space/kendrix/R_Functions/heat_scree_plot.R", local = knitr::knit_global())
heat_scree_plot(Loadings, Importance, Num, Order)

SV + Cell Type PCA.

meta <- as.data.frame(AMD_pData)
PCA_full <- princomp(M_values.funnorm.filt) #You can't have NAs in your dataframe - make sure to either remove probes with a lot of NAs and/or have imputed values. Can run na.omit but depending on the stage of pre-processing, could result in a lot of probes to be removed and a very inaccurate PCA.
Loadings <- as.data.frame(unclass(PCA_full$loadings))
vars <- PCA_full$sdev^2
Importance <- vars/sum(vars)
adjust <- 1-Importance[1]
pca_adjusted <- Importance[2:length(Importance)]/adjust
pca_df <- data.frame(adjusted_variance = pca_adjusted, PC = seq(1:length(pca_adjusted)))

#Restructure meta so that variables are in the appropriate format - categorical variables (sex is commonly labelled as 0 and 1) are factors and not numeric for example.

colnames(meta)[6] <- "Sentrix_Position"
colnames(meta)[7] <- "Sentrix_ID"

meta$Sentrix_ID <- as.factor(meta$Sentrix_ID)
meta$Sentrix_Position <- as.factor(meta$Sentrix_Position)
meta$Sex <- as.factor(meta$Sex)
meta$Sample_Group <- as.factor(meta$Sample_Group)
meta$Age <- as.numeric(meta$Age)
meta$SV <- as.numeric(meta$SV)
meta$Epi <- as.numeric(meta$Epi)
meta$Fib <- as.numeric(meta$Fib)
meta$B <- as.numeric(meta$B)
meta$NK <- as.numeric(meta$NK)
meta$CD4T <- as.numeric(meta$CD4T)
meta$CD8T <- as.numeric(meta$CD8T)
meta$Mono <- as.numeric(meta$Mono)
meta$Neutro <- as.numeric(meta$Neutro)
meta$Eosino <- as.numeric(meta$Eosino)

colnames(meta) #Obtain the column numbers to include.
##  [1] "Sample_Name"      "Sample_Group"     "Sex"              "Age"             
##  [5] "Tissue"           "Sentrix_Position" "Sentrix_ID"       "Basename"        
##  [9] "filenames"        "xMed"             "yMed"             "predictedSex"    
## [13] "SV"               "Epi"              "Fib"              "B"               
## [17] "NK"               "CD4T"             "CD8T"             "Mono"            
## [21] "Neutro"           "Eosino"
meta_categorical <- data.frame(meta[, c(2,3,6,7)])  #Input column numbers in meta that contain categorical variables.
meta_continuous <- data.frame(meta[, c(4,13:18,20:22)])  #Input column numbers in meta that contain continuous variables.
colnames(meta_categorical) #Write the line below to ensure you're changing the names of your variables in the right order.
## [1] "Sample_Group"     "Sex"              "Sentrix_Position" "Sentrix_ID"
colnames(meta_categorical) <- c("Disease State", "Sex", "Row", "Chip")
colnames(meta_continuous)
##  [1] "Age"    "SV"     "Epi"    "Fib"    "B"      "NK"     "CD4T"   "Mono"  
##  [9] "Neutro" "Eosino"
colnames(meta_continuous) <- c("Age", "SV", "Epithelial", "Fibroblast", "B Cell", "NK Cell", "CD4T", "Monocyte", "Neutrophil", "Eosinophil")

Order <- c(seq(1:sum(ncol(meta_categorical), ncol(meta_continuous))))
Num <- 16 #This number will depend on your dataset - if you only have 6 samples, you should probably only show 5 PCs or less.
#Run PCA.
source("~/KoborLab/kobor_space/kendrix/R_Functions/heat_scree_plot.R", local = knitr::knit_global())
heat_scree_plot(Loadings, Importance, Num, Order)

Step 8: Clean-up final pData object and save objects.

#Clean up column names of AMD_pData.
colnames(AMD_pData)[2] <- "Disease_State"
colnames(AMD_pData)[6:7] <- c("Row", "Chip")
colnames(AMD_pData)[14:22] <- c("Epithelial", "Fibroblast", "B_Cell", "NK_Cell", "CD4T", "CD8T", "Monocyte", "Neutrophil", "Eosinophil")

save(AMD_pData, M_values.funnorm.filt, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_SVA_M_values.funnorm.RData")

Correlation between variables

Step 1: Look at metadata heatmap.

Look for correlation between variables in the metadata. There might be a possibility that Age and Sex are confounded with Batch.

#Load objects.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_SVA_M_values.funnorm.RData")

#Subset the biological and technical variables to check correlation.
colnames(AMD_pData) #Check columns to subset.
##  [1] "Sample_Name"   "Disease_State" "Sex"           "Age"          
##  [5] "Tissue"        "Row"           "Chip"          "Basename"     
##  [9] "filenames"     "xMed"          "yMed"          "predictedSex" 
## [13] "SV"            "Epithelial"    "Fibroblast"    "B_Cell"       
## [17] "NK_Cell"       "CD4T"          "CD8T"          "Monocyte"     
## [21] "Neutrophil"    "Eosinophil"
meta.cor <- AMD_pData[, c(2:4, 6:7, 13:18, 20:22)] #I didn't include tissue because they are all the same tissue. 

#Turn categorical variables into numerical variables.
meta.cor$Disease_State <- as.numeric(unlist(as.factor(meta.cor$Disease_State)))
meta.cor$Sex <- as.numeric((unlist(as.factor(meta.cor$Sex))))
meta.cor$Age <- as.numeric(meta.cor$Age)
meta.cor$Row <- as.numeric(unlist(as.factor(meta.cor$Row)))
meta.cor$Chip <- as.numeric(unlist(as.factor(meta.cor$Chip)))

str(meta.cor)
## 'data.frame':    44 obs. of  14 variables:
##  $ Disease_State: num  2 2 2 1 1 1 2 1 1 1 ...
##  $ Sex          : num  2 2 2 2 2 1 2 2 1 2 ...
##  $ Age          : num  61 74 70 76 79 89 66 70 83 76 ...
##  $ Row          : num  3 10 12 1 3 5 7 9 11 2 ...
##  $ Chip         : num  2 2 2 4 4 4 4 4 4 4 ...
##  $ SV           : num  -0.1222 -0.011 -0.1074 -0.073 -0.0265 ...
##  $ Epithelial   : num  0.387 0.436 0.367 0.413 0.387 ...
##  $ Fibroblast   : num  0.326 0.316 0.298 0.24 0.312 ...
##  $ B_Cell       : num  0.0502 0.0454 0.0582 0.0528 0.0482 ...
##  $ NK_Cell      : num  0.065 0.0601 0.0816 0.0986 0.0743 ...
##  $ CD4T         : num  0.0478 0.0467 0.0545 0.0658 0.0544 ...
##  $ Monocyte     : num  0.0685 0.05 0.081 0.0666 0.0651 ...
##  $ Neutrophil   : num  0 0 0 0 0 ...
##  $ Eosinophil   : num  0.0555 0.0457 0.0597 0.0632 0.0597 ...
#Metadata correlation.
colnames(meta.cor) #Choose variables to check.
##  [1] "Disease_State" "Sex"           "Age"           "Row"          
##  [5] "Chip"          "SV"            "Epithelial"    "Fibroblast"   
##  [9] "B_Cell"        "NK_Cell"       "CD4T"          "Monocyte"     
## [13] "Neutrophil"    "Eosinophil"
meta.SV.celltype <- meta.cor[, c(6:14)]

grey <- colorRampPalette(brewer.pal(n = 9, "BrBG"))
heatmap.2(cor(meta.SV.celltype, use = "pairwise.complete.obs", method = "spearman"), cexCol = 1.5, cexRow = 1.5, col = grey, dendrogram = "both", scale = "none", margins = c(8,8), trace = "none", keysize = 1.3)

#Metadata correlation.
colnames(meta.cor) #Choose variables to check.
##  [1] "Disease_State" "Sex"           "Age"           "Row"          
##  [5] "Chip"          "SV"            "Epithelial"    "Fibroblast"   
##  [9] "B_Cell"        "NK_Cell"       "CD4T"          "Monocyte"     
## [13] "Neutrophil"    "Eosinophil"
meta.all <- meta.cor[, c(1:7)]

grey <- colorRampPalette(brewer.pal(n = 9, "BrBG"))
heatmap.2(cor(meta.all, use = "pairwise.complete.obs", method = "spearman"), cexCol = 1.5, cexRow = 1.5, col = grey, dendrogram = "both", scale = "none", margins = c(10,10), trace = "none", keysize = 1.3)

Step 2: Look at final PCA.

meta <- as.data.frame(AMD_pData)
PCA_full <- princomp(M_values.funnorm.filt) #You can't have NAs in your dataframe - make sure to either remove probes with a lot of NAs and/or have imputed values. Can run na.omit but depending on the stage of pre-processing, could result in a lot of probes to be removed and a very inaccurate PCA.
Loadings <- as.data.frame(unclass(PCA_full$loadings))
vars <- PCA_full$sdev^2
Importance <- vars/sum(vars)
adjust <- 1-Importance[1]
pca_adjusted <- Importance[2:length(Importance)]/adjust
pca_df <- data.frame(adjusted_variance = pca_adjusted, PC = seq(1:length(pca_adjusted)))

#Restructure meta so that variables are in the appropriate format - categorical variables (sex is commonly labelled as 0 and 1) are factors and not numeric for example.

colnames(meta)[6] <- "Sentrix_Position"
colnames(meta)[7] <- "Sentrix_ID"

meta$Sentrix_ID <- as.factor(meta$Sentrix_ID)
meta$Sentrix_Position <- as.factor(meta$Sentrix_Position)
meta$Sex <- as.factor(meta$Sex)
meta$Disease_State <- as.factor(meta$Disease_State)
meta$Age <- as.numeric(meta$Age)
meta$SV <- as.numeric(meta$SV)
meta$Epithelial <- as.numeric(meta$Epithelial)

colnames(meta) #Obtain the column numbers to include.
##  [1] "Sample_Name"      "Disease_State"    "Sex"              "Age"             
##  [5] "Tissue"           "Sentrix_Position" "Sentrix_ID"       "Basename"        
##  [9] "filenames"        "xMed"             "yMed"             "predictedSex"    
## [13] "SV"               "Epithelial"       "Fibroblast"       "B_Cell"          
## [17] "NK_Cell"          "CD4T"             "CD8T"             "Monocyte"        
## [21] "Neutrophil"       "Eosinophil"
meta_categorical <- data.frame(meta[, c(2,3,6,7)])  #Input column numbers in meta that contain categorical variables.
meta_continuous <- data.frame(meta[, c(4,13,14)])  #Input column numbers in meta that contain continuous variables.
colnames(meta_categorical) #Write the line below to ensure you're changing the names of your variables in the right order.
## [1] "Disease_State"    "Sex"              "Sentrix_Position" "Sentrix_ID"
colnames(meta_categorical) <- c("Disease State", "Sex", "Row", "Chip")
colnames(meta_continuous)
## [1] "Age"        "SV"         "Epithelial"
Order <- c(seq(1:sum(ncol(meta_categorical), ncol(meta_continuous))))
Num <- 16 #This number will depend on your dataset - if you only have 6 samples, you should probably only show 5 PCs or less.
#Run PCA.
source("~/KoborLab/kobor_space/kendrix/R_Functions/heat_scree_plot.R", local = knitr::knit_global())
heat_scree_plot(Loadings, Importance, Num, Order)

Step 3: Plot Age that is confounded by Chip effect.

#Age is confounded by chip. 
ggplot(AMD_pData, aes(Chip, Age)) + 
  geom_boxplot() + 
  geom_jitter(aes(fill = Chip), position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) + 
  xlab("Chip") + 
  scale_x_discrete(breaks = c("200723300084","200723300089","200723300090","200770460039"), labels = c("Chip084","Chip089","Chip090","Chip039")) + 
  theme_classic2() + theme(legend.position = "none")


EWAS Replication

Replicating Porter et al.’s hits using their model.

The chunks below are done in an attempt to replicate Porter et al.’s hits in their publication to check the integrity of the dataset.

#Load all the objects.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_SVA_M_values.funnorm.RData") 

dim(AMD_pData) #44 samples.
## [1] 44 22
dim(M_values.funnorm.filt) #425456 probes.
## [1] 425456     44
AMD_pData$Disease_State <- as.factor(AMD_pData$Disease_State)
AMD_pData$Sex <- as.factor(AMD_pData$Sex)
AMD_pData$Row <- as.factor(AMD_pData$Row)
AMD_pData$Chip <- as.factor(AMD_pData$Chip)
AMD_pData$Age <- as.numeric(AMD_pData$Age)

str(AMD_pData)
## 'data.frame':    44 obs. of  22 variables:
##  $ Sample_Name  : chr  "Sample 1" "Sample 10" "Sample 11" "Sample 12" ...
##  $ Disease_State: Factor w/ 2 levels "age-related macular degeneration",..: 2 2 2 1 1 1 2 1 1 1 ...
##  $ Sex          : Factor w/ 2 levels "F","M": 2 2 2 2 2 1 2 2 1 2 ...
##  $ Age          : num  61 74 70 76 79 89 66 70 83 76 ...
##  $ Tissue       : chr  "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" ...
##  $ Row          : Factor w/ 12 levels "R01C01","R01C02",..: 3 10 12 1 3 5 7 9 11 2 ...
##  $ Chip         : Factor w/ 4 levels "200723300084",..: 2 2 2 4 4 4 4 4 4 4 ...
##  $ Basename     : chr  "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
##  $ filenames    : chr  "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
##  $ xMed         : num  11.4 11.7 11.9 11.7 11.6 ...
##  $ yMed         : num  11.7 12 12.1 11.9 11.9 ...
##  $ predictedSex : chr  "M" "M" "M" "M" ...
##  $ SV           : num  -0.1222 -0.011 -0.1074 -0.073 -0.0265 ...
##  $ Epithelial   : num  0.387 0.436 0.367 0.413 0.387 ...
##  $ Fibroblast   : num  0.326 0.316 0.298 0.24 0.312 ...
##  $ B_Cell       : num  0.0502 0.0454 0.0582 0.0528 0.0482 ...
##  $ NK_Cell      : num  0.065 0.0601 0.0816 0.0986 0.0743 ...
##  $ CD4T         : num  0.0478 0.0467 0.0545 0.0658 0.0544 ...
##  $ CD8T         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Monocyte     : num  0.0685 0.05 0.081 0.0666 0.0651 ...
##  $ Neutrophil   : num  0 0 0 0 0 ...
##  $ Eosinophil   : num  0.0555 0.0457 0.0597 0.0632 0.0597 ...
#Check order.
identical(rownames(AMD_pData), colnames(M_values.funnorm.filt)) #TRUE. 
## [1] TRUE
#Sanity check - there should be no NAs or infinite numbers - which could be a result of logit transformation of 0 or 1 beta values. 
all(complete.cases(M_values.funnorm.filt)) == "TRUE" #TRUE - meaning no NA or infinite numbers. 
## [1] TRUE
library(pbapply) #Progress bar for apply functions.

#EWAS on Disease_State - All samples.
#LM: Need to use transformed M-values instead of beta values as it is more statistically sound.
Porter_LM_pval <- pbsapply(1:nrow(M_values.funnorm.filt), function(CpG){
  meta <- AMD_pData
  meta$Mval <- M_values.funnorm.filt[CpG,]
  mod_Porter <- lm(Mval ~ Disease_State + Sex + Disease_State*Sex + Chip, data = meta) 
  coef(summary(mod_Porter))[2,4]}) #Returns nominal p-value for Disease State for model at each CpG.
head(Porter_LM_pval)

save(Porter_LM_pval, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Porter_LM_pval.RData")
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Porter_LM_pval.RData")

#Inspect p-value distribution for model.
pvalue_dist_Porter <- data.frame(CpG = rownames(M_values.funnorm.filt), Nominal_P = Porter_LM_pval)
ggplot(pvalue_dist_Porter, aes(Nominal_P)) + 
  geom_histogram(fill = "grey90", color = "black") + 
  theme_classic() + xlab("Nominal P Value") + 
  ylim(0, 20000) + 
  xlim(min(Porter_LM_pval), max(Porter_LM_pval))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).

#Not right-skewed. Distribution is a little even. 

#Multiple test correction with FDR.
M_values.funnorm.filt <- as.data.frame(M_values.funnorm.filt)
Multi_test_corr_relaxed <- p.adjust(Porter_LM_pval, method = "fdr", n = length(Porter_LM_pval))

#Looking at FDR thresholds for hits:
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.05),]) #0 at 0.05.
## [1]  0 44
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.1),]) #0 at 0.1.
## [1]  0 44
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.2),]) #1 at 0.2.
## [1]  1 44
#Looking at top hits by nominal P:
pvalue_dist_Porter <- pvalue_dist_Porter[order(pvalue_dist_Porter$Nominal_P),]
head(pvalue_dist_Porter)
##               CpG    Nominal_P
## 171012 cg07685601 3.761885e-07
## 152996 cg08744475 9.429167e-07
## 383678 cg19561908 2.657040e-06
## 386931 cg21116314 2.946301e-06
## 325595 cg16540262 3.344313e-06
## 232459 cg06113708 9.410994e-06
#Load 450K annotation data.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
head(EPIC_Annotation_Complete)
##                  Name AddressA_ID
## cg07881041 cg07881041  0085713262
## cg18478105 cg18478105  0046761277
## cg23229610 cg23229610  0021717843
## cg03513874 cg03513874  0029622133
## cg09835024 cg09835024  0016745152
## cg05451842 cg05451842  0016681196
##                                              AlleleA_ProbeSeq AddressB_ID
## cg07881041 CTACAAATACAACACCCTCAACCCATATTTCATATATTATCTCATTTAAC            
## cg18478105 AAATAAATTTCACTCTCAAATCCCAATCTCATACAACAAAACAAAAACCA  0086644198
## cg23229610 ATAAAATTCTTTCCTTAAAAAACATTAAAACCAAAATAAACAAAAATTCC            
## cg03513874 ACAATAAAATAATAAAATCCCATCACTACTTACCCTCCTTAAATAATATC            
## cg09835024 AATAAACACCAACCCCAAACCAATCTCACTTTATTAAATTACAAAAATCA  0081631976
## cg05451842 CRTTCAAATACACTATAACCCRACTAAAAAAACCCCCAACAACCCAAAAC            
##                                              AlleleB_ProbeSeq
## cg07881041                                                   
## cg18478105 AAATAAATTTCGCTCTCAAATCCCAATCTCGTACGACGAAACGAAAACCG
## cg23229610                                                   
## cg03513874                                                   
## cg09835024 AATAAACGCCGACCCCGAACCGATCTCGCTTTATTAAATTACAAAAATCG
## cg05451842                                                   
##            Infinium_Design_Type Next_Base Color_Channel
## cg07881041                   II                        
## cg18478105                    I         C           Grn
## cg23229610                   II                        
## cg03513874                   II                        
## cg09835024                    I         A           Red
## cg05451842                   II                        
##                                                                                                                        Forward_Sequence
## cg07881041 CTGCACGCCTACTGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAA[CG]CATGTATCATCTCATTTAATGCATGCATTATCTCATTTAATTCTCACAACCCCTCAGGTG
## cg18478105 TCCCGTCTTACGGGATGGATTTCGCTCTCAGGTCCCAGTCTCGTGCGGCGGGGCGGGGAC[CG]CAGCCGGCTGGGCGGGGAAGCCCTGAGCCGGGGAAGTCACGTGGGGCGTGTCCGGAGGCG
## cg23229610 GTTTCTGGACAGTAAAATTCTTTCCTTGAAGGACATTAGGGCCAAAATGGGCAAGGATTC[CG]AGATTGGTACATCGAGCGTTATCTTCCAACTCTCTTTTCTAAATGGGCTCATTTAGTAAT
## cg03513874 ATTGTGCCCACCTTGCTGCTGACAGTTAAGCATCACTAAAGTAGGAAATAGGGTCCAAAC[CG]ACACTACTTAAGGAGGGCAAGTAGTGATGGGACCTCATCATCCCATTGCTATCATGGAGC
## cg09835024 AGCCCCGTCATAGGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAAT[CG]GTATTCAAAAAAAAAAAAAAAAAAGGGCGGGGAGGACACTCCCTCTTCTCTGTTCCCACA
## cg05451842 CACAGCGTGGATGCCCCGATTTCCCAGGTCCCTCCGCAACCCTCAGTAGAACTCCCACCG[CG]CCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAACGCCCCGGTCCGT
##            Genome_Build CHR   MAPINFO
## cg07881041           37  19   5236016
## cg18478105           37  20  61847650
## cg23229610           37   1   6841125
## cg03513874           37   2 198303466
## cg09835024           37   X  24072640
## cg05451842           37  14  93581139
##                                                     SourceSeq Strand
## cg07881041 TGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAACG      R
## cg18478105 CGGTCCCCGCCCCGCCGCACGAGACTGGGACCTGAGAGCGAAATCCATCC      R
## cg23229610 CGGAATCCTTGCCCATTTTGGCCCTAATGTCCTTCAAGGAAAGAATTTTA      R
## cg03513874 CAATGGGATGATGAGGTCCCATCACTACTTGCCCTCCTTAAGTAGTGTCG      F
## cg09835024 GGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAATCG      R
## cg05451842 CGCCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAAC      F
##                  UCSC_RefGene_Name                  UCSC_RefGene_Accession
## cg07881041 PTPRS;PTPRS;PTPRS;PTPRS NM_130855;NM_002850;NM_130854;NM_130853
## cg18478105                  YTHDF1                               NM_017798
## cg23229610                                                                
## cg03513874                                                                
## cg09835024                  EIF2S3                               NM_001415
## cg05451842       ITPK1;ITPK1;ITPK1     NM_001142593;NM_014216;NM_001142594
##             UCSC_RefGene_Group    UCSC_CpG_Islands_Name
## cg07881041 Body;Body;Body;Body    chr19:5237294-5237669
## cg18478105              TSS200  chr20:61846843-61848103
## cg23229610                         chr1:6844313-6846366
## cg03513874                     chr2:198299244-198299972
## cg09835024             TSS1500   chrX:24072558-24073135
## cg05451842      Body;Body;Body  chr14:93581083-93582797
##            Relation_to_UCSC_CpG_Island Phantom4_Enhancers Phantom5_Enhancers
## cg07881041                     N_Shore                                      
## cg18478105                      Island                                      
## cg23229610                     N_Shelf                                      
## cg03513874                     S_Shelf                                      
## cg09835024                      Island                                      
## cg05451842                      Island                                      
##            DMR X450k_Enhancer           HMM_Island Regulatory_Feature_Name
## cg07881041                 NA                                             
## cg18478105                 NA 20:61317142-61318498    20:61846284-61847956
## cg23229610                 NA                                             
## cg03513874                 NA                                             
## cg09835024                 NA                          X:24071907-24073667
## cg05451842                 NA 14:92650663-92652544                        
##            Regulatory_Feature_Group GencodeBasicV12_NAME
## cg07881041                                              
## cg18478105      Promoter_Associated        YTHDF1;YTHDF1
## cg23229610                                              
## cg03513874                                              
## cg09835024      Promoter_Associated               EIF2S3
## cg05451842                                         ITPK1
##                      GencodeBasicV12_Accession GencodeBasicV12_Group
## cg07881041                                                          
## cg18478105 ENST00000370334.4;ENST00000370339.3         TSS200;TSS200
## cg23229610                                                          
## cg03513874                                                          
## cg09835024                   ENST00000253039.4                TSS200
## cg05451842                   ENST00000555495.1                 5'UTR
##             GencodeCompV12_NAME
## cg07881041                     
## cg18478105        YTHDF1;YTHDF1
## cg23229610                     
## cg03513874                     
## cg09835024 EIF2S3;EIF2S3;EIF2S3
## cg05451842                ITPK1
##                                         GencodeCompV12_Accession
## cg07881041                                                      
## cg18478105                   ENST00000370334.4;ENST00000370339.3
## cg23229610                                                      
## cg03513874                                                      
## cg09835024 ENST00000487075.1;ENST00000423068.1;ENST00000253039.4
## cg05451842                                     ENST00000555495.1
##              GencodeCompV12_Group DNase_Hypersensitivity_NAME
## cg07881041                                                   
## cg18478105          TSS200;TSS200     chr20:61847520-61847755
## cg23229610                                                   
## cg03513874                                                   
## cg09835024 TSS1500;TSS1500;TSS200      chrX:24072600-24073395
## cg05451842                  5'UTR     chr14:93581080-93581375
##            DNase_Hypersensitivity_Evidence_Count OpenChromatin_NAME
## cg07881041                                    NA                   
## cg18478105                                     3                   
## cg23229610                                    NA                   
## cg03513874                                    NA                   
## cg09835024                                     3                   
## cg05451842                                     3                   
##            OpenChromatin_Evidence_Count TFBS_NAME TFBS_Evidence_Count
## cg07881041                           NA                            NA
## cg18478105                           NA                            NA
## cg23229610                           NA                            NA
## cg03513874                           NA                            NA
## cg09835024                           NA                            NA
## cg05451842                           NA                            NA
##            Methyl27_Loci Methyl450_Loci Chromosome_36 Coordinate_36
## cg07881041            NA           TRUE            19       5187016
## cg18478105            NA           TRUE            20      61318095
## cg23229610            NA           TRUE             1       6763712
## cg03513874            NA           TRUE             2     198011711
## cg09835024            NA           TRUE             X      23982561
## cg05451842            NA           TRUE            14      92650892
##                             SNP_ID SNP_DISTANCE SNP_MinorAlleleFrequency
## cg07881041             rs187313142           18                 0.000200
## cg18478105             rs549944121            5                 0.001797
## cg23229610 rs545824288;rs527255711        40;12        0.000200;0.001198
## cg03513874                                                              
## cg09835024                                                              
## cg05451842             rs550745821           22                 0.000200
##            Random_Loci  X strand Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs
## cg07881041          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg18478105          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg23229610          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg03513874          NA NA      +     <NA>        NA   <NA>      NA   <NA>
## cg09835024          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg05451842          NA NA      +     <NA>        NA   <NA>      NA   <NA>
##            SBE_maf CH_450_XY CH_450_Aut CH_EPIC Cross_Hyb
## cg07881041      NA        No         No      No        No
## cg18478105      NA        No         No      No        No
## cg23229610      NA        No         No      No        No
## cg03513874      NA        No         No      No        No
## cg09835024      NA        No         No      No        No
## cg05451842      NA        No         No      No        No
dim(hits_CpGs <- pvalue_dist_Porter[which(pvalue_dist_Porter$Nominal_P < 1e-6),]) #2 hits.
## [1] 2 2
hits <- EPIC_Annotation_Complete[which(EPIC_Annotation_Complete$Name%in%hits_CpGs$CpG),]
hits$UCSC_RefGene_Name
## [1] MAD1L1;MAD1L1;MAD1L1                     
## 66070 Levels:  A1BG A1BG-AS1;A1BG A1BG-AS1;A1BG;ZNF497;ZNF497 ... ZZZ3;ZZZ3;ZZZ3
#Delta beta.
#Using Maggie's code for deltabeta:
deltabeta <- function(df, mainvar, covar1 = NULL, covar2 = NULL, covar3 = NULL, covar4 = NULL, covar5 = NULL) {
  # Calculating delta beta of the main variable of interest (mainvar), with up to 5 possible covariates (covar)
  # mainvar should be a vector of continuous variable
  # all covars should also be vectors
  # df = dataframe or matrix of beta values
  # output is a vector of delta beta values
  sd=sd(mainvar)
  qt <-
    range <- max(mainvar, na.rm = T) - min(mainvar, na.rm = T)
  dB <- vector(mode = "numeric", length = nrow(df))
  names(dB) <- rownames(df)
  for (i in 1:nrow(df)) {
    beta <- df[i, ]
    if (is.null(covar1)) {
      mod <- lm(beta ~ mainvar)
    } else if (is.null(covar2)) {
      mod <- lm(beta ~ mainvar + covar1)
    } else if (is.null(covar3)) {
      mod <- lm(beta ~ mainvar + covar1 + covar2)
    } else if (is.null(covar4)) {
      mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3)
    } else if (is.null(covar5)) {
      mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4)
    } else {
      mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4 + covar5)
    }
    slope <- mod$coefficients[2]
    dB[i] <- as.numeric(slope*range)
  }
  dB
}

betas.funnorm.filt <- m2beta(M_values.funnorm.filt)

delta_beta_Porter_fixed <- deltabeta(as.matrix(betas.funnorm.filt), as.numeric(as.factor(AMD_pData$Disease_State)), 
                                     covar1 = as.numeric(as.factor(AMD_pData$Sex)), 
                                     covar2 = as.numeric(as.factor(AMD_pData$Disease_State))*as.numeric(as.factor(AMD_pData$Sex)),
                                     covar3 = as.numeric(as.factor(AMD_pData$Chip)))
length(delta_beta_Porter_fixed)
summary(delta_beta_Porter_fixed)

save(delta_beta_Porter_fixed, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/DB_Porter_fixed.RData")
##3. Linear Model: Volcano Plot
#Volcano to examine hits (for DB, see below chunks):
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/DB_Porter_fixed.RData")

#Call Volcano (Nominal p Version, modified from Rachel's code):
source("/home/BCRICWH.LAN/dlin/KoborLab/kobor_space/cake/home/dlin/Volcano_DL_Nominal.R")

#After running the last 2 chunks, make a summary table with CpG, Nominal_P, FDR, and Delta Beta.
Porter_Table <- data.frame(rownames(M_values.funnorm.filt), Porter_LM_pval, Multi_test_corr_relaxed, delta_beta_Porter_fixed)
colnames(Porter_Table) = c("CpG", "Nominal_P", "FDR", "Delta_Beta")
identical(as.character(rownames(Porter_Table)), as.character(Porter_Table$CpG)) #TRUE.
## [1] TRUE
#Looking at top hits quickly without considering DB:
head(Porter_Table[order(Porter_Table$Nominal_P),],10)
##                   CpG    Nominal_P       FDR  Delta_Beta
## cg07685601 cg07685601 3.761885e-07 0.1600517 -0.09740259
## cg08744475 cg08744475 9.429167e-07 0.2005848 -0.11530537
## cg19561908 cg19561908 2.657040e-06 0.2845716 -0.13768018
## cg21116314 cg21116314 2.946301e-06 0.2845716 -0.32142870
## cg16540262 cg16540262 3.344313e-06 0.2845716 -0.11951332
## cg06113708 cg06113708 9.410994e-06 0.6673273 -0.10939259
## cg20668447 cg20668447 1.244784e-05 0.7565727 -0.02217991
## cg24033103 cg24033103 1.466881e-05 0.7801166 -0.14165294
## cg02283787 cg02283787 2.191394e-05 0.9055315 -0.10045724
## cg09562539 cg09562539 2.708660e-05 0.9055315  0.04114078
##Setting a threshold of 0.05DB, 5e-6 Nominal P (scale to 0.60DB):
makeVolcano_nominal(Porter_Table$Nominal_P, Porter_Table$Delta_Beta, 0.05, 1e-6, "DNAm changes", 0.3) #at 5e-6: 7 Hypermethylated, 0 Hypomethylated
## [1] "Hypermethylated: 0"
## [1] "Hypomethylated: 2"
## Warning: Removed 130 rows containing missing values (geom_point).

#What are these hits?
#First make an annotated table - load 450K manifest.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
head(EPIC_Annotation_Complete)
##                  Name AddressA_ID
## cg07881041 cg07881041  0085713262
## cg18478105 cg18478105  0046761277
## cg23229610 cg23229610  0021717843
## cg03513874 cg03513874  0029622133
## cg09835024 cg09835024  0016745152
## cg05451842 cg05451842  0016681196
##                                              AlleleA_ProbeSeq AddressB_ID
## cg07881041 CTACAAATACAACACCCTCAACCCATATTTCATATATTATCTCATTTAAC            
## cg18478105 AAATAAATTTCACTCTCAAATCCCAATCTCATACAACAAAACAAAAACCA  0086644198
## cg23229610 ATAAAATTCTTTCCTTAAAAAACATTAAAACCAAAATAAACAAAAATTCC            
## cg03513874 ACAATAAAATAATAAAATCCCATCACTACTTACCCTCCTTAAATAATATC            
## cg09835024 AATAAACACCAACCCCAAACCAATCTCACTTTATTAAATTACAAAAATCA  0081631976
## cg05451842 CRTTCAAATACACTATAACCCRACTAAAAAAACCCCCAACAACCCAAAAC            
##                                              AlleleB_ProbeSeq
## cg07881041                                                   
## cg18478105 AAATAAATTTCGCTCTCAAATCCCAATCTCGTACGACGAAACGAAAACCG
## cg23229610                                                   
## cg03513874                                                   
## cg09835024 AATAAACGCCGACCCCGAACCGATCTCGCTTTATTAAATTACAAAAATCG
## cg05451842                                                   
##            Infinium_Design_Type Next_Base Color_Channel
## cg07881041                   II                        
## cg18478105                    I         C           Grn
## cg23229610                   II                        
## cg03513874                   II                        
## cg09835024                    I         A           Red
## cg05451842                   II                        
##                                                                                                                        Forward_Sequence
## cg07881041 CTGCACGCCTACTGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAA[CG]CATGTATCATCTCATTTAATGCATGCATTATCTCATTTAATTCTCACAACCCCTCAGGTG
## cg18478105 TCCCGTCTTACGGGATGGATTTCGCTCTCAGGTCCCAGTCTCGTGCGGCGGGGCGGGGAC[CG]CAGCCGGCTGGGCGGGGAAGCCCTGAGCCGGGGAAGTCACGTGGGGCGTGTCCGGAGGCG
## cg23229610 GTTTCTGGACAGTAAAATTCTTTCCTTGAAGGACATTAGGGCCAAAATGGGCAAGGATTC[CG]AGATTGGTACATCGAGCGTTATCTTCCAACTCTCTTTTCTAAATGGGCTCATTTAGTAAT
## cg03513874 ATTGTGCCCACCTTGCTGCTGACAGTTAAGCATCACTAAAGTAGGAAATAGGGTCCAAAC[CG]ACACTACTTAAGGAGGGCAAGTAGTGATGGGACCTCATCATCCCATTGCTATCATGGAGC
## cg09835024 AGCCCCGTCATAGGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAAT[CG]GTATTCAAAAAAAAAAAAAAAAAAGGGCGGGGAGGACACTCCCTCTTCTCTGTTCCCACA
## cg05451842 CACAGCGTGGATGCCCCGATTTCCCAGGTCCCTCCGCAACCCTCAGTAGAACTCCCACCG[CG]CCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAACGCCCCGGTCCGT
##            Genome_Build CHR   MAPINFO
## cg07881041           37  19   5236016
## cg18478105           37  20  61847650
## cg23229610           37   1   6841125
## cg03513874           37   2 198303466
## cg09835024           37   X  24072640
## cg05451842           37  14  93581139
##                                                     SourceSeq Strand
## cg07881041 TGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAACG      R
## cg18478105 CGGTCCCCGCCCCGCCGCACGAGACTGGGACCTGAGAGCGAAATCCATCC      R
## cg23229610 CGGAATCCTTGCCCATTTTGGCCCTAATGTCCTTCAAGGAAAGAATTTTA      R
## cg03513874 CAATGGGATGATGAGGTCCCATCACTACTTGCCCTCCTTAAGTAGTGTCG      F
## cg09835024 GGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAATCG      R
## cg05451842 CGCCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAAC      F
##                  UCSC_RefGene_Name                  UCSC_RefGene_Accession
## cg07881041 PTPRS;PTPRS;PTPRS;PTPRS NM_130855;NM_002850;NM_130854;NM_130853
## cg18478105                  YTHDF1                               NM_017798
## cg23229610                                                                
## cg03513874                                                                
## cg09835024                  EIF2S3                               NM_001415
## cg05451842       ITPK1;ITPK1;ITPK1     NM_001142593;NM_014216;NM_001142594
##             UCSC_RefGene_Group    UCSC_CpG_Islands_Name
## cg07881041 Body;Body;Body;Body    chr19:5237294-5237669
## cg18478105              TSS200  chr20:61846843-61848103
## cg23229610                         chr1:6844313-6846366
## cg03513874                     chr2:198299244-198299972
## cg09835024             TSS1500   chrX:24072558-24073135
## cg05451842      Body;Body;Body  chr14:93581083-93582797
##            Relation_to_UCSC_CpG_Island Phantom4_Enhancers Phantom5_Enhancers
## cg07881041                     N_Shore                                      
## cg18478105                      Island                                      
## cg23229610                     N_Shelf                                      
## cg03513874                     S_Shelf                                      
## cg09835024                      Island                                      
## cg05451842                      Island                                      
##            DMR X450k_Enhancer           HMM_Island Regulatory_Feature_Name
## cg07881041                 NA                                             
## cg18478105                 NA 20:61317142-61318498    20:61846284-61847956
## cg23229610                 NA                                             
## cg03513874                 NA                                             
## cg09835024                 NA                          X:24071907-24073667
## cg05451842                 NA 14:92650663-92652544                        
##            Regulatory_Feature_Group GencodeBasicV12_NAME
## cg07881041                                              
## cg18478105      Promoter_Associated        YTHDF1;YTHDF1
## cg23229610                                              
## cg03513874                                              
## cg09835024      Promoter_Associated               EIF2S3
## cg05451842                                         ITPK1
##                      GencodeBasicV12_Accession GencodeBasicV12_Group
## cg07881041                                                          
## cg18478105 ENST00000370334.4;ENST00000370339.3         TSS200;TSS200
## cg23229610                                                          
## cg03513874                                                          
## cg09835024                   ENST00000253039.4                TSS200
## cg05451842                   ENST00000555495.1                 5'UTR
##             GencodeCompV12_NAME
## cg07881041                     
## cg18478105        YTHDF1;YTHDF1
## cg23229610                     
## cg03513874                     
## cg09835024 EIF2S3;EIF2S3;EIF2S3
## cg05451842                ITPK1
##                                         GencodeCompV12_Accession
## cg07881041                                                      
## cg18478105                   ENST00000370334.4;ENST00000370339.3
## cg23229610                                                      
## cg03513874                                                      
## cg09835024 ENST00000487075.1;ENST00000423068.1;ENST00000253039.4
## cg05451842                                     ENST00000555495.1
##              GencodeCompV12_Group DNase_Hypersensitivity_NAME
## cg07881041                                                   
## cg18478105          TSS200;TSS200     chr20:61847520-61847755
## cg23229610                                                   
## cg03513874                                                   
## cg09835024 TSS1500;TSS1500;TSS200      chrX:24072600-24073395
## cg05451842                  5'UTR     chr14:93581080-93581375
##            DNase_Hypersensitivity_Evidence_Count OpenChromatin_NAME
## cg07881041                                    NA                   
## cg18478105                                     3                   
## cg23229610                                    NA                   
## cg03513874                                    NA                   
## cg09835024                                     3                   
## cg05451842                                     3                   
##            OpenChromatin_Evidence_Count TFBS_NAME TFBS_Evidence_Count
## cg07881041                           NA                            NA
## cg18478105                           NA                            NA
## cg23229610                           NA                            NA
## cg03513874                           NA                            NA
## cg09835024                           NA                            NA
## cg05451842                           NA                            NA
##            Methyl27_Loci Methyl450_Loci Chromosome_36 Coordinate_36
## cg07881041            NA           TRUE            19       5187016
## cg18478105            NA           TRUE            20      61318095
## cg23229610            NA           TRUE             1       6763712
## cg03513874            NA           TRUE             2     198011711
## cg09835024            NA           TRUE             X      23982561
## cg05451842            NA           TRUE            14      92650892
##                             SNP_ID SNP_DISTANCE SNP_MinorAlleleFrequency
## cg07881041             rs187313142           18                 0.000200
## cg18478105             rs549944121            5                 0.001797
## cg23229610 rs545824288;rs527255711        40;12        0.000200;0.001198
## cg03513874                                                              
## cg09835024                                                              
## cg05451842             rs550745821           22                 0.000200
##            Random_Loci  X strand Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs
## cg07881041          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg18478105          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg23229610          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg03513874          NA NA      +     <NA>        NA   <NA>      NA   <NA>
## cg09835024          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg05451842          NA NA      +     <NA>        NA   <NA>      NA   <NA>
##            SBE_maf CH_450_XY CH_450_Aut CH_EPIC Cross_Hyb
## cg07881041      NA        No         No      No        No
## cg18478105      NA        No         No      No        No
## cg23229610      NA        No         No      No        No
## cg03513874      NA        No         No      No        No
## cg09835024      NA        No         No      No        No
## cg05451842      NA        No         No      No        No
Porter_Table.annotated = merge(Porter_Table, EPIC_Annotation_Complete[,c("Name", "CHR", "Strand", "UCSC_RefGene_Name", "UCSC_RefGene_Group")], by.x = "CpG", by.y = "Name", all = FALSE)
colnames(Porter_Table.annotated)[5:6] = c("Chromosome", "Coordinate")
Porter_Table.annotated <- Porter_Table.annotated[order(Porter_Table.annotated$Nominal_P),]

#Grabbing the Volcano hits:
LM_Porter_Hits <- Porter_Table.annotated[which(abs(Porter_Table.annotated$Delta_Beta)>0.05 & Porter_Table.annotated$Nominal_P<1e-6),]

#Let's order by Nominal_P:
LM_Porter_Hits = LM_Porter_Hits[order(LM_Porter_Hits$Nominal_P),]
rownames(LM_Porter_Hits) = c()
str(LM_Porter_Hits)
## 'data.frame':    2 obs. of  8 variables:
##  $ CpG               : Factor w/ 425456 levels "cg00000029","cg00000108",..: 131396 148247
##  $ Nominal_P         : num  3.76e-07 9.43e-07
##  $ FDR               : num  0.16 0.201
##  $ Delta_Beta        : num  -0.0974 -0.1153
##  $ Chromosome        : Factor w/ 25 levels "","1","10","11",..: 21 20
##  $ Coordinate        : Factor w/ 3 levels "","F","R": 3 2
##  $ UCSC_RefGene_Name : Factor w/ 66070 levels "","A1BG","A1BG-AS1;A1BG",..: 33670 1
##  $ UCSC_RefGene_Group: Factor w/ 8044 levels "","1stExon","1stExon;1stExon",..: 4267 1
head(LM_Porter_Hits)
##          CpG    Nominal_P       FDR  Delta_Beta Chromosome Coordinate
## 1 cg07685601 3.761885e-07 0.1600517 -0.09740259          7          R
## 2 cg08744475 9.429167e-07 0.2005848 -0.11530537          6          F
##      UCSC_RefGene_Name UCSC_RefGene_Group
## 1 MAD1L1;MAD1L1;MAD1L1     Body;Body;Body
## 2

EWAS

EWAS Model 1: All Samples on Age only.

No covariates.

#Load all the objects.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_SVA_M_values.funnorm.RData") 

dim(AMD_pData) #44 samples.
## [1] 44 22
dim(M_values.funnorm.filt) #425456 probes.
## [1] 425456     44
AMD_pData$Disease_State <- as.factor(AMD_pData$Disease_State)
AMD_pData$Sex <- as.factor(AMD_pData$Sex)
AMD_pData$Row <- as.factor(AMD_pData$Row)
AMD_pData$Chip <- as.factor(AMD_pData$Chip)
AMD_pData$Age <- as.numeric(AMD_pData$Age)

str(AMD_pData)
## 'data.frame':    44 obs. of  22 variables:
##  $ Sample_Name  : chr  "Sample 1" "Sample 10" "Sample 11" "Sample 12" ...
##  $ Disease_State: Factor w/ 2 levels "age-related macular degeneration",..: 2 2 2 1 1 1 2 1 1 1 ...
##  $ Sex          : Factor w/ 2 levels "F","M": 2 2 2 2 2 1 2 2 1 2 ...
##  $ Age          : num  61 74 70 76 79 89 66 70 83 76 ...
##  $ Tissue       : chr  "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" ...
##  $ Row          : Factor w/ 12 levels "R01C01","R01C02",..: 3 10 12 1 3 5 7 9 11 2 ...
##  $ Chip         : Factor w/ 4 levels "200723300084",..: 2 2 2 4 4 4 4 4 4 4 ...
##  $ Basename     : chr  "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
##  $ filenames    : chr  "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
##  $ xMed         : num  11.4 11.7 11.9 11.7 11.6 ...
##  $ yMed         : num  11.7 12 12.1 11.9 11.9 ...
##  $ predictedSex : chr  "M" "M" "M" "M" ...
##  $ SV           : num  -0.1222 -0.011 -0.1074 -0.073 -0.0265 ...
##  $ Epithelial   : num  0.387 0.436 0.367 0.413 0.387 ...
##  $ Fibroblast   : num  0.326 0.316 0.298 0.24 0.312 ...
##  $ B_Cell       : num  0.0502 0.0454 0.0582 0.0528 0.0482 ...
##  $ NK_Cell      : num  0.065 0.0601 0.0816 0.0986 0.0743 ...
##  $ CD4T         : num  0.0478 0.0467 0.0545 0.0658 0.0544 ...
##  $ CD8T         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Monocyte     : num  0.0685 0.05 0.081 0.0666 0.0651 ...
##  $ Neutrophil   : num  0 0 0 0 0 ...
##  $ Eosinophil   : num  0.0555 0.0457 0.0597 0.0632 0.0597 ...
#Check order.
identical(rownames(AMD_pData), colnames(M_values.funnorm.filt)) #TRUE. 
## [1] TRUE
#Sanity check - there should be no NAs or infinite numbers - which could be a result of logit transformation of 0 or 1 beta values. 
all(complete.cases(M_values.funnorm.filt)) == "TRUE" #TRUE - meaning no NA or infinite numbers. 
## [1] TRUE
library(pbapply) #Progress bar for apply functions.

#EWAS on Age - All samples.
#LM: Need to use transformed M-values instead of beta values as it is more statistically sound.
Age_LM_pval <- pbsapply(1:nrow(M_values.funnorm.filt), function(CpG){
  meta <- AMD_pData
  meta$Mval <- M_values.funnorm.filt[CpG,]
  mod_Age <- lm(Mval ~ Age, data = meta) 
  coef(summary(mod_Age))[2,4]}) #Returns nominal p-value for Age for model at each CpG.
head(Age_LM_pval)

save(Age_LM_pval, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Age_LM_pval.RData")
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Age_LM_pval.RData")

#Inspect p-value distribution for model.
pvalue_dist_Age <- data.frame(CpG = rownames(M_values.funnorm.filt), Nominal_P = Age_LM_pval)
ggplot(pvalue_dist_Age, aes(Nominal_P)) + 
  geom_histogram(fill = "grey90", color = "black") + 
  theme_classic() + xlab("Nominal P Value") + 
  ylim(0, 20000) + 
  xlim(min(Age_LM_pval), max(Age_LM_pval))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).

#Not right-skewed. Distribution is a little even. 

#Multiple test correction with FDR.
M_values.funnorm.filt <- as.data.frame(M_values.funnorm.filt)
Multi_test_corr_relaxed <- p.adjust(Age_LM_pval, method = "fdr", n = length(Age_LM_pval))

#Looking at FDR thresholds for hits:
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.05),]) #1 at 0.05.
## [1]  1 44
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.1),]) #1 at 0.1.
## [1]  1 44
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.2),]) #8 at 0.2.
## [1]  8 44
#Looking at top hits by nominal P:
pvalue_dist_Age <- pvalue_dist_Age[order(pvalue_dist_Age$Nominal_P),]
head(pvalue_dist_Age)
##               CpG    Nominal_P
## 51300  cg22197050 9.782282e-08
## 347431 cg06596654 9.002000e-07
## 295662 cg02727104 1.154277e-06
## 50945  cg22166290 1.224729e-06
## 120455 cg09011833 1.648428e-06
## 112991 cg01594233 1.928795e-06
#Load 450K annotation data.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
head(EPIC_Annotation_Complete)
##                  Name AddressA_ID
## cg07881041 cg07881041  0085713262
## cg18478105 cg18478105  0046761277
## cg23229610 cg23229610  0021717843
## cg03513874 cg03513874  0029622133
## cg09835024 cg09835024  0016745152
## cg05451842 cg05451842  0016681196
##                                              AlleleA_ProbeSeq AddressB_ID
## cg07881041 CTACAAATACAACACCCTCAACCCATATTTCATATATTATCTCATTTAAC            
## cg18478105 AAATAAATTTCACTCTCAAATCCCAATCTCATACAACAAAACAAAAACCA  0086644198
## cg23229610 ATAAAATTCTTTCCTTAAAAAACATTAAAACCAAAATAAACAAAAATTCC            
## cg03513874 ACAATAAAATAATAAAATCCCATCACTACTTACCCTCCTTAAATAATATC            
## cg09835024 AATAAACACCAACCCCAAACCAATCTCACTTTATTAAATTACAAAAATCA  0081631976
## cg05451842 CRTTCAAATACACTATAACCCRACTAAAAAAACCCCCAACAACCCAAAAC            
##                                              AlleleB_ProbeSeq
## cg07881041                                                   
## cg18478105 AAATAAATTTCGCTCTCAAATCCCAATCTCGTACGACGAAACGAAAACCG
## cg23229610                                                   
## cg03513874                                                   
## cg09835024 AATAAACGCCGACCCCGAACCGATCTCGCTTTATTAAATTACAAAAATCG
## cg05451842                                                   
##            Infinium_Design_Type Next_Base Color_Channel
## cg07881041                   II                        
## cg18478105                    I         C           Grn
## cg23229610                   II                        
## cg03513874                   II                        
## cg09835024                    I         A           Red
## cg05451842                   II                        
##                                                                                                                        Forward_Sequence
## cg07881041 CTGCACGCCTACTGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAA[CG]CATGTATCATCTCATTTAATGCATGCATTATCTCATTTAATTCTCACAACCCCTCAGGTG
## cg18478105 TCCCGTCTTACGGGATGGATTTCGCTCTCAGGTCCCAGTCTCGTGCGGCGGGGCGGGGAC[CG]CAGCCGGCTGGGCGGGGAAGCCCTGAGCCGGGGAAGTCACGTGGGGCGTGTCCGGAGGCG
## cg23229610 GTTTCTGGACAGTAAAATTCTTTCCTTGAAGGACATTAGGGCCAAAATGGGCAAGGATTC[CG]AGATTGGTACATCGAGCGTTATCTTCCAACTCTCTTTTCTAAATGGGCTCATTTAGTAAT
## cg03513874 ATTGTGCCCACCTTGCTGCTGACAGTTAAGCATCACTAAAGTAGGAAATAGGGTCCAAAC[CG]ACACTACTTAAGGAGGGCAAGTAGTGATGGGACCTCATCATCCCATTGCTATCATGGAGC
## cg09835024 AGCCCCGTCATAGGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAAT[CG]GTATTCAAAAAAAAAAAAAAAAAAGGGCGGGGAGGACACTCCCTCTTCTCTGTTCCCACA
## cg05451842 CACAGCGTGGATGCCCCGATTTCCCAGGTCCCTCCGCAACCCTCAGTAGAACTCCCACCG[CG]CCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAACGCCCCGGTCCGT
##            Genome_Build CHR   MAPINFO
## cg07881041           37  19   5236016
## cg18478105           37  20  61847650
## cg23229610           37   1   6841125
## cg03513874           37   2 198303466
## cg09835024           37   X  24072640
## cg05451842           37  14  93581139
##                                                     SourceSeq Strand
## cg07881041 TGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAACG      R
## cg18478105 CGGTCCCCGCCCCGCCGCACGAGACTGGGACCTGAGAGCGAAATCCATCC      R
## cg23229610 CGGAATCCTTGCCCATTTTGGCCCTAATGTCCTTCAAGGAAAGAATTTTA      R
## cg03513874 CAATGGGATGATGAGGTCCCATCACTACTTGCCCTCCTTAAGTAGTGTCG      F
## cg09835024 GGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAATCG      R
## cg05451842 CGCCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAAC      F
##                  UCSC_RefGene_Name                  UCSC_RefGene_Accession
## cg07881041 PTPRS;PTPRS;PTPRS;PTPRS NM_130855;NM_002850;NM_130854;NM_130853
## cg18478105                  YTHDF1                               NM_017798
## cg23229610                                                                
## cg03513874                                                                
## cg09835024                  EIF2S3                               NM_001415
## cg05451842       ITPK1;ITPK1;ITPK1     NM_001142593;NM_014216;NM_001142594
##             UCSC_RefGene_Group    UCSC_CpG_Islands_Name
## cg07881041 Body;Body;Body;Body    chr19:5237294-5237669
## cg18478105              TSS200  chr20:61846843-61848103
## cg23229610                         chr1:6844313-6846366
## cg03513874                     chr2:198299244-198299972
## cg09835024             TSS1500   chrX:24072558-24073135
## cg05451842      Body;Body;Body  chr14:93581083-93582797
##            Relation_to_UCSC_CpG_Island Phantom4_Enhancers Phantom5_Enhancers
## cg07881041                     N_Shore                                      
## cg18478105                      Island                                      
## cg23229610                     N_Shelf                                      
## cg03513874                     S_Shelf                                      
## cg09835024                      Island                                      
## cg05451842                      Island                                      
##            DMR X450k_Enhancer           HMM_Island Regulatory_Feature_Name
## cg07881041                 NA                                             
## cg18478105                 NA 20:61317142-61318498    20:61846284-61847956
## cg23229610                 NA                                             
## cg03513874                 NA                                             
## cg09835024                 NA                          X:24071907-24073667
## cg05451842                 NA 14:92650663-92652544                        
##            Regulatory_Feature_Group GencodeBasicV12_NAME
## cg07881041                                              
## cg18478105      Promoter_Associated        YTHDF1;YTHDF1
## cg23229610                                              
## cg03513874                                              
## cg09835024      Promoter_Associated               EIF2S3
## cg05451842                                         ITPK1
##                      GencodeBasicV12_Accession GencodeBasicV12_Group
## cg07881041                                                          
## cg18478105 ENST00000370334.4;ENST00000370339.3         TSS200;TSS200
## cg23229610                                                          
## cg03513874                                                          
## cg09835024                   ENST00000253039.4                TSS200
## cg05451842                   ENST00000555495.1                 5'UTR
##             GencodeCompV12_NAME
## cg07881041                     
## cg18478105        YTHDF1;YTHDF1
## cg23229610                     
## cg03513874                     
## cg09835024 EIF2S3;EIF2S3;EIF2S3
## cg05451842                ITPK1
##                                         GencodeCompV12_Accession
## cg07881041                                                      
## cg18478105                   ENST00000370334.4;ENST00000370339.3
## cg23229610                                                      
## cg03513874                                                      
## cg09835024 ENST00000487075.1;ENST00000423068.1;ENST00000253039.4
## cg05451842                                     ENST00000555495.1
##              GencodeCompV12_Group DNase_Hypersensitivity_NAME
## cg07881041                                                   
## cg18478105          TSS200;TSS200     chr20:61847520-61847755
## cg23229610                                                   
## cg03513874                                                   
## cg09835024 TSS1500;TSS1500;TSS200      chrX:24072600-24073395
## cg05451842                  5'UTR     chr14:93581080-93581375
##            DNase_Hypersensitivity_Evidence_Count OpenChromatin_NAME
## cg07881041                                    NA                   
## cg18478105                                     3                   
## cg23229610                                    NA                   
## cg03513874                                    NA                   
## cg09835024                                     3                   
## cg05451842                                     3                   
##            OpenChromatin_Evidence_Count TFBS_NAME TFBS_Evidence_Count
## cg07881041                           NA                            NA
## cg18478105                           NA                            NA
## cg23229610                           NA                            NA
## cg03513874                           NA                            NA
## cg09835024                           NA                            NA
## cg05451842                           NA                            NA
##            Methyl27_Loci Methyl450_Loci Chromosome_36 Coordinate_36
## cg07881041            NA           TRUE            19       5187016
## cg18478105            NA           TRUE            20      61318095
## cg23229610            NA           TRUE             1       6763712
## cg03513874            NA           TRUE             2     198011711
## cg09835024            NA           TRUE             X      23982561
## cg05451842            NA           TRUE            14      92650892
##                             SNP_ID SNP_DISTANCE SNP_MinorAlleleFrequency
## cg07881041             rs187313142           18                 0.000200
## cg18478105             rs549944121            5                 0.001797
## cg23229610 rs545824288;rs527255711        40;12        0.000200;0.001198
## cg03513874                                                              
## cg09835024                                                              
## cg05451842             rs550745821           22                 0.000200
##            Random_Loci  X strand Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs
## cg07881041          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg18478105          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg23229610          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg03513874          NA NA      +     <NA>        NA   <NA>      NA   <NA>
## cg09835024          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg05451842          NA NA      +     <NA>        NA   <NA>      NA   <NA>
##            SBE_maf CH_450_XY CH_450_Aut CH_EPIC Cross_Hyb
## cg07881041      NA        No         No      No        No
## cg18478105      NA        No         No      No        No
## cg23229610      NA        No         No      No        No
## cg03513874      NA        No         No      No        No
## cg09835024      NA        No         No      No        No
## cg05451842      NA        No         No      No        No
dim(hits_CpGs <- pvalue_dist_Age[which(pvalue_dist_Age$Nominal_P < 1e-6),]) #2 hits.
## [1] 2 2
hits <- EPIC_Annotation_Complete[which(EPIC_Annotation_Complete$Name%in%hits_CpGs$CpG),]
hits$UCSC_RefGene_Name
## [1]              LOC100132215
## 66070 Levels:  A1BG A1BG-AS1;A1BG A1BG-AS1;A1BG;ZNF497;ZNF497 ... ZZZ3;ZZZ3;ZZZ3
#Delta beta.
#Using Maggie's code for deltabeta:
deltabeta <- function(df, mainvar, covar1 = NULL, covar2 = NULL, covar3 = NULL, covar4 = NULL, covar5 = NULL) {
  # Calculating delta beta of the main variable of interest (mainvar), with up to 5 possible covariates (covar)
  # mainvar should be a vector of continuous variable
  # all covars should also be vectors
  # df = dataframe or matrix of beta values
  # output is a vector of delta beta values
  sd=sd(mainvar)
  qt <-
    range <- max(mainvar, na.rm = T) - min(mainvar, na.rm = T)
  dB <- vector(mode = "numeric", length = nrow(df))
  names(dB) <- rownames(df)
  for (i in 1:nrow(df)) {
    beta <- df[i, ]
    if (is.null(covar1)) {
      mod <- lm(beta ~ mainvar)
    } else if (is.null(covar2)) {
      mod <- lm(beta ~ mainvar + covar1)
    } else if (is.null(covar3)) {
      mod <- lm(beta ~ mainvar + covar1 + covar2)
    } else if (is.null(covar4)) {
      mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3)
    } else if (is.null(covar5)) {
      mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4)
    } else {
      mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4 + covar5)
    }
    slope <- mod$coefficients[2]
    dB[i] <- as.numeric(slope*range)
  }
  dB
}

betas.funnorm.filt <- m2beta(M_values.funnorm.filt)

delta_beta_Age_fixed <- deltabeta(as.matrix(betas.funnorm.filt), AMD_pData$Age)
length(delta_beta_Age_fixed)
summary(delta_beta_Age_fixed)

save(delta_beta_Age_fixed, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/DB_Age_fixed.RData")
##3. Linear Model: Volcano Plot
#Volcano to examine hits (for DB, see below chunks):
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/DB_Age_fixed.RData")

#Call Volcano (Nominal p Version, modified from Rachel's code):
source("/home/BCRICWH.LAN/dlin/KoborLab/kobor_space/cake/home/dlin/Volcano_DL_Nominal.R")

#After running the last 2 chunks, make a summary table with CpG, Nominal_P, FDR, and Delta Beta.
Age_Table <- data.frame(rownames(M_values.funnorm.filt), Age_LM_pval, Multi_test_corr_relaxed, delta_beta_Age_fixed)
colnames(Age_Table) = c("CpG", "Nominal_P", "FDR", "Delta_Beta")
identical(as.character(rownames(Age_Table)), as.character(Age_Table$CpG)) #TRUE.
## [1] TRUE
#Looking at top hits quickly without considering DB:
head(Age_Table[order(Age_Table$Nominal_P),],10)
##                   CpG    Nominal_P        FDR Delta_Beta
## cg22197050 cg22197050 9.782282e-08 0.04161931 0.07334226
## cg06596654 cg06596654 9.002000e-07 0.12583304 0.19294123
## cg02727104 cg02727104 1.154277e-06 0.12583304 0.12586465
## cg22166290 cg22166290 1.224729e-06 0.12583304 0.06480999
## cg09011833 cg09011833 1.648428e-06 0.12583304 0.04786280
## cg01594233 cg01594233 1.928795e-06 0.12583304 0.11625353
## cg23813012 cg23813012 2.248637e-06 0.12583304 0.09040002
## cg06173889 cg06173889 2.366083e-06 0.12583304 0.21664766
## cg17786642 cg17786642 5.481186e-06 0.25911150 0.13094473
## cg15444387 cg15444387 6.873199e-06 0.29242438 0.08297815
##Setting a threshold of 0.05DB, 5e-6 Nominal P (scale to 0.60DB):
makeVolcano_nominal(Age_Table$Nominal_P, Age_Table$Delta_Beta, 0.05, 5e-6, "DNAm changes", 0.3) #at 5e-6: 7 Hypermethylated, 0 Hypomethylated
## [1] "Hypermethylated: 7"
## [1] "Hypomethylated: 0"
## Warning: Removed 236 rows containing missing values (geom_point).

#What are these hits?
#First make an annotated table - load 450K manifest.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
head(EPIC_Annotation_Complete)
##                  Name AddressA_ID
## cg07881041 cg07881041  0085713262
## cg18478105 cg18478105  0046761277
## cg23229610 cg23229610  0021717843
## cg03513874 cg03513874  0029622133
## cg09835024 cg09835024  0016745152
## cg05451842 cg05451842  0016681196
##                                              AlleleA_ProbeSeq AddressB_ID
## cg07881041 CTACAAATACAACACCCTCAACCCATATTTCATATATTATCTCATTTAAC            
## cg18478105 AAATAAATTTCACTCTCAAATCCCAATCTCATACAACAAAACAAAAACCA  0086644198
## cg23229610 ATAAAATTCTTTCCTTAAAAAACATTAAAACCAAAATAAACAAAAATTCC            
## cg03513874 ACAATAAAATAATAAAATCCCATCACTACTTACCCTCCTTAAATAATATC            
## cg09835024 AATAAACACCAACCCCAAACCAATCTCACTTTATTAAATTACAAAAATCA  0081631976
## cg05451842 CRTTCAAATACACTATAACCCRACTAAAAAAACCCCCAACAACCCAAAAC            
##                                              AlleleB_ProbeSeq
## cg07881041                                                   
## cg18478105 AAATAAATTTCGCTCTCAAATCCCAATCTCGTACGACGAAACGAAAACCG
## cg23229610                                                   
## cg03513874                                                   
## cg09835024 AATAAACGCCGACCCCGAACCGATCTCGCTTTATTAAATTACAAAAATCG
## cg05451842                                                   
##            Infinium_Design_Type Next_Base Color_Channel
## cg07881041                   II                        
## cg18478105                    I         C           Grn
## cg23229610                   II                        
## cg03513874                   II                        
## cg09835024                    I         A           Red
## cg05451842                   II                        
##                                                                                                                        Forward_Sequence
## cg07881041 CTGCACGCCTACTGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAA[CG]CATGTATCATCTCATTTAATGCATGCATTATCTCATTTAATTCTCACAACCCCTCAGGTG
## cg18478105 TCCCGTCTTACGGGATGGATTTCGCTCTCAGGTCCCAGTCTCGTGCGGCGGGGCGGGGAC[CG]CAGCCGGCTGGGCGGGGAAGCCCTGAGCCGGGGAAGTCACGTGGGGCGTGTCCGGAGGCG
## cg23229610 GTTTCTGGACAGTAAAATTCTTTCCTTGAAGGACATTAGGGCCAAAATGGGCAAGGATTC[CG]AGATTGGTACATCGAGCGTTATCTTCCAACTCTCTTTTCTAAATGGGCTCATTTAGTAAT
## cg03513874 ATTGTGCCCACCTTGCTGCTGACAGTTAAGCATCACTAAAGTAGGAAATAGGGTCCAAAC[CG]ACACTACTTAAGGAGGGCAAGTAGTGATGGGACCTCATCATCCCATTGCTATCATGGAGC
## cg09835024 AGCCCCGTCATAGGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAAT[CG]GTATTCAAAAAAAAAAAAAAAAAAGGGCGGGGAGGACACTCCCTCTTCTCTGTTCCCACA
## cg05451842 CACAGCGTGGATGCCCCGATTTCCCAGGTCCCTCCGCAACCCTCAGTAGAACTCCCACCG[CG]CCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAACGCCCCGGTCCGT
##            Genome_Build CHR   MAPINFO
## cg07881041           37  19   5236016
## cg18478105           37  20  61847650
## cg23229610           37   1   6841125
## cg03513874           37   2 198303466
## cg09835024           37   X  24072640
## cg05451842           37  14  93581139
##                                                     SourceSeq Strand
## cg07881041 TGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAACG      R
## cg18478105 CGGTCCCCGCCCCGCCGCACGAGACTGGGACCTGAGAGCGAAATCCATCC      R
## cg23229610 CGGAATCCTTGCCCATTTTGGCCCTAATGTCCTTCAAGGAAAGAATTTTA      R
## cg03513874 CAATGGGATGATGAGGTCCCATCACTACTTGCCCTCCTTAAGTAGTGTCG      F
## cg09835024 GGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAATCG      R
## cg05451842 CGCCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAAC      F
##                  UCSC_RefGene_Name                  UCSC_RefGene_Accession
## cg07881041 PTPRS;PTPRS;PTPRS;PTPRS NM_130855;NM_002850;NM_130854;NM_130853
## cg18478105                  YTHDF1                               NM_017798
## cg23229610                                                                
## cg03513874                                                                
## cg09835024                  EIF2S3                               NM_001415
## cg05451842       ITPK1;ITPK1;ITPK1     NM_001142593;NM_014216;NM_001142594
##             UCSC_RefGene_Group    UCSC_CpG_Islands_Name
## cg07881041 Body;Body;Body;Body    chr19:5237294-5237669
## cg18478105              TSS200  chr20:61846843-61848103
## cg23229610                         chr1:6844313-6846366
## cg03513874                     chr2:198299244-198299972
## cg09835024             TSS1500   chrX:24072558-24073135
## cg05451842      Body;Body;Body  chr14:93581083-93582797
##            Relation_to_UCSC_CpG_Island Phantom4_Enhancers Phantom5_Enhancers
## cg07881041                     N_Shore                                      
## cg18478105                      Island                                      
## cg23229610                     N_Shelf                                      
## cg03513874                     S_Shelf                                      
## cg09835024                      Island                                      
## cg05451842                      Island                                      
##            DMR X450k_Enhancer           HMM_Island Regulatory_Feature_Name
## cg07881041                 NA                                             
## cg18478105                 NA 20:61317142-61318498    20:61846284-61847956
## cg23229610                 NA                                             
## cg03513874                 NA                                             
## cg09835024                 NA                          X:24071907-24073667
## cg05451842                 NA 14:92650663-92652544                        
##            Regulatory_Feature_Group GencodeBasicV12_NAME
## cg07881041                                              
## cg18478105      Promoter_Associated        YTHDF1;YTHDF1
## cg23229610                                              
## cg03513874                                              
## cg09835024      Promoter_Associated               EIF2S3
## cg05451842                                         ITPK1
##                      GencodeBasicV12_Accession GencodeBasicV12_Group
## cg07881041                                                          
## cg18478105 ENST00000370334.4;ENST00000370339.3         TSS200;TSS200
## cg23229610                                                          
## cg03513874                                                          
## cg09835024                   ENST00000253039.4                TSS200
## cg05451842                   ENST00000555495.1                 5'UTR
##             GencodeCompV12_NAME
## cg07881041                     
## cg18478105        YTHDF1;YTHDF1
## cg23229610                     
## cg03513874                     
## cg09835024 EIF2S3;EIF2S3;EIF2S3
## cg05451842                ITPK1
##                                         GencodeCompV12_Accession
## cg07881041                                                      
## cg18478105                   ENST00000370334.4;ENST00000370339.3
## cg23229610                                                      
## cg03513874                                                      
## cg09835024 ENST00000487075.1;ENST00000423068.1;ENST00000253039.4
## cg05451842                                     ENST00000555495.1
##              GencodeCompV12_Group DNase_Hypersensitivity_NAME
## cg07881041                                                   
## cg18478105          TSS200;TSS200     chr20:61847520-61847755
## cg23229610                                                   
## cg03513874                                                   
## cg09835024 TSS1500;TSS1500;TSS200      chrX:24072600-24073395
## cg05451842                  5'UTR     chr14:93581080-93581375
##            DNase_Hypersensitivity_Evidence_Count OpenChromatin_NAME
## cg07881041                                    NA                   
## cg18478105                                     3                   
## cg23229610                                    NA                   
## cg03513874                                    NA                   
## cg09835024                                     3                   
## cg05451842                                     3                   
##            OpenChromatin_Evidence_Count TFBS_NAME TFBS_Evidence_Count
## cg07881041                           NA                            NA
## cg18478105                           NA                            NA
## cg23229610                           NA                            NA
## cg03513874                           NA                            NA
## cg09835024                           NA                            NA
## cg05451842                           NA                            NA
##            Methyl27_Loci Methyl450_Loci Chromosome_36 Coordinate_36
## cg07881041            NA           TRUE            19       5187016
## cg18478105            NA           TRUE            20      61318095
## cg23229610            NA           TRUE             1       6763712
## cg03513874            NA           TRUE             2     198011711
## cg09835024            NA           TRUE             X      23982561
## cg05451842            NA           TRUE            14      92650892
##                             SNP_ID SNP_DISTANCE SNP_MinorAlleleFrequency
## cg07881041             rs187313142           18                 0.000200
## cg18478105             rs549944121            5                 0.001797
## cg23229610 rs545824288;rs527255711        40;12        0.000200;0.001198
## cg03513874                                                              
## cg09835024                                                              
## cg05451842             rs550745821           22                 0.000200
##            Random_Loci  X strand Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs
## cg07881041          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg18478105          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg23229610          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg03513874          NA NA      +     <NA>        NA   <NA>      NA   <NA>
## cg09835024          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg05451842          NA NA      +     <NA>        NA   <NA>      NA   <NA>
##            SBE_maf CH_450_XY CH_450_Aut CH_EPIC Cross_Hyb
## cg07881041      NA        No         No      No        No
## cg18478105      NA        No         No      No        No
## cg23229610      NA        No         No      No        No
## cg03513874      NA        No         No      No        No
## cg09835024      NA        No         No      No        No
## cg05451842      NA        No         No      No        No
Age_Table.annotated = merge(Age_Table, EPIC_Annotation_Complete[,c("Name", "CHR", "Strand", "UCSC_RefGene_Name", "UCSC_RefGene_Group")], by.x = "CpG", by.y = "Name", all = FALSE)
colnames(Age_Table.annotated)[5:6] = c("Chromosome", "Coordinate")
Age_Table.annotated <- Age_Table.annotated[order(Age_Table.annotated$Nominal_P),]

#Grabbing the Volcano hits:
LM_Age_Hits <- Age_Table.annotated[which(abs(Age_Table.annotated$Delta_Beta)>0.05 & Age_Table.annotated$Nominal_P<5e-6),]

#Let's order by Nominal_P:
LM_Age_Hits = LM_Age_Hits[order(LM_Age_Hits$Nominal_P),]
rownames(LM_Age_Hits) = c()
str(LM_Age_Hits)
## 'data.frame':    7 obs. of  8 variables:
##  $ CpG               : Factor w/ 425456 levels "cg00000029","cg00000108",..: 345056 112794 48814 344702 28944 367394 106386
##  $ Nominal_P         : num  9.78e-08 9.00e-07 1.15e-06 1.22e-06 1.93e-06 ...
##  $ FDR               : num  0.0416 0.1258 0.1258 0.1258 0.1258 ...
##  $ Delta_Beta        : num  0.0733 0.1929 0.1259 0.0648 0.1163 ...
##  $ Chromosome        : Factor w/ 25 levels "","1","10","11",..: 13 9 6 13 18 2 13
##  $ Coordinate        : Factor w/ 3 levels "","F","R": 3 3 2 2 3 3 3
##  $ UCSC_RefGene_Name : Factor w/ 66070 levels "","A1BG","A1BG-AS1;A1BG",..: 30522 1 1 4760 1 45910 54913
##  $ UCSC_RefGene_Group: Factor w/ 8044 levels "","1stExon","1stExon;1stExon",..: 5441 1 1 151 1 5441 1209
head(LM_Age_Hits)
##          CpG    Nominal_P        FDR Delta_Beta Chromosome Coordinate
## 1 cg22197050 9.782282e-08 0.04161931 0.07334226          2          R
## 2 cg06596654 9.002000e-07 0.12583304 0.19294123         16          R
## 3 cg02727104 1.154277e-06 0.12583304 0.12586465         13          F
## 4 cg22166290 1.224729e-06 0.12583304 0.06480999          2          F
## 5 cg01594233 1.928795e-06 0.12583304 0.11625353          4          R
## 6 cg23813012 2.248637e-06 0.12583304 0.09040002          1          R
##                           UCSC_RefGene_Name
## 1                              LOC100132215
## 2                                          
## 3                                          
## 4 BCL11A;BCL11A;BCL11A;BCL11A;BCL11A;BCL11A
## 5                                          
## 6                                     PRDM2
##                          UCSC_RefGene_Group
## 1                                   TSS1500
## 2                                          
## 3                                          
## 4 1stExon;1stExon;1stExon;5'UTR;5'UTR;5'UTR
## 5                                          
## 6                                   TSS1500

Age-associated hits: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3482848/ Non-tissue specific hits: https://epigeneticsandchromatin.biomedcentral.com/articles/10.1186/s13072-018-0191-3

#Reorder row index.
rownames(Age_Table.annotated) <- NULL

#Look at hits based on candidate genes.
head(Age_Table.annotated[which(Age_Table.annotated$UCSC_RefGene_Name == "ELOVL2"),]) #First ELOVL2 hit == row 14999.
##               CpG  Nominal_P       FDR  Delta_Beta Chromosome Coordinate
## 5818   cg24724428 0.01306187 0.8679507  0.13816266          6          F
## 10597  cg16867657 0.02498331 0.9073335  0.12001180          6          F
## 22215  cg01799681 0.05528434 0.9629861 -0.06722849          6          F
## 40061  cg21572722 0.10147615 0.9880938  0.06130403          6          F
## 52950  cg13562911 0.13527099 0.9993493  0.01746916          6          R
## 128106 cg25151806 0.32871854 0.9999614  0.01135130          6          R
##        UCSC_RefGene_Name UCSC_RefGene_Group
## 5818              ELOVL2            TSS1500
## 10597             ELOVL2            TSS1500
## 22215             ELOVL2               Body
## 40061             ELOVL2            TSS1500
## 52950             ELOVL2               Body
## 128106            ELOVL2            TSS1500
head(Age_Table.annotated[which(Age_Table.annotated$UCSC_RefGene_Name == "EDARADD"),]) #First EDARADD hit == row 198718.
##              CpG Nominal_P       FDR Delta_Beta Chromosome Coordinate
## 87169 cg18964582  0.223704 0.9999614 0.01332839          1          R
##       UCSC_RefGene_Name UCSC_RefGene_Group
## 87169           EDARADD            TSS1500
head(Age_Table.annotated[which(Age_Table.annotated$UCSC_RefGene_Name == "TOM1L1"),]) #First TOM1L1 hit == row 6147.
##              CpG   Nominal_P       FDR Delta_Beta Chromosome Coordinate
## 389   cg25431220 0.000539516 0.5571367 0.02272728         17          F
## 4659  cg07081054 0.010129245 0.8435414 0.05077271         17          R
## 17992 cg03870845 0.044346041 0.9501010 0.01825487         17          F
## 61481 cg10237252 0.157455931 0.9999614 0.08567139         17          R
## 93820 cg05265484 0.240881548 0.9999614 0.06522953         17          F
## 97185 cg12240603 0.249676189 0.9999614 0.02272064         17          F
##       UCSC_RefGene_Name UCSC_RefGene_Group
## 389              TOM1L1             TSS200
## 4659             TOM1L1               Body
## 17992            TOM1L1             TSS200
## 61481            TOM1L1            TSS1500
## 93820            TOM1L1            TSS1500
## 97185            TOM1L1            TSS1500
head(Age_Table.annotated[which(Age_Table.annotated$UCSC_RefGene_Name == "NPTX2"),]) #First NPTX2 hit == row 18625.
##              CpG  Nominal_P       FDR Delta_Beta Chromosome Coordinate
## 8960  cg13878520 0.02083867 0.8954641 0.03776219          7          R
## 30407 cg08315202 0.07634766 0.9770872 0.04132384          7          R
## 31888 cg13314145 0.08037648 0.9807878 0.02325648          7          R
## 66889 cg13695954 0.17144570 0.9999614 0.01185541          7          F
## 71619 cg13585675 0.18350664 0.9999614 0.03665194          7          F
## 85391 cg17278447 0.21901598 0.9999614 0.04496371          7          R
##       UCSC_RefGene_Name UCSC_RefGene_Group
## 8960              NPTX2               Body
## 30407             NPTX2            TSS1500
## 31888             NPTX2            TSS1500
## 66889             NPTX2               Body
## 71619             NPTX2               Body
## 85391             NPTX2               Body

EWAS Model 2: All samples on Age + SV.

Only SV (cell type) as covariate.

#Load all the objects.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_SVA_M_values.funnorm.RData") 

dim(AMD_pData) #44 samples.
## [1] 44 22
dim(M_values.funnorm.filt) #425456 probes.
## [1] 425456     44
AMD_pData$Disease_State <- as.factor(AMD_pData$Disease_State)
AMD_pData$Sex <- as.factor(AMD_pData$Sex)
AMD_pData$Row <- as.factor(AMD_pData$Row)
AMD_pData$Chip <- as.factor(AMD_pData$Chip)
AMD_pData$Age <- as.numeric(AMD_pData$Age)

str(AMD_pData)
## 'data.frame':    44 obs. of  22 variables:
##  $ Sample_Name  : chr  "Sample 1" "Sample 10" "Sample 11" "Sample 12" ...
##  $ Disease_State: Factor w/ 2 levels "age-related macular degeneration",..: 2 2 2 1 1 1 2 1 1 1 ...
##  $ Sex          : Factor w/ 2 levels "F","M": 2 2 2 2 2 1 2 2 1 2 ...
##  $ Age          : num  61 74 70 76 79 89 66 70 83 76 ...
##  $ Tissue       : chr  "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" ...
##  $ Row          : Factor w/ 12 levels "R01C01","R01C02",..: 3 10 12 1 3 5 7 9 11 2 ...
##  $ Chip         : Factor w/ 4 levels "200723300084",..: 2 2 2 4 4 4 4 4 4 4 ...
##  $ Basename     : chr  "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
##  $ filenames    : chr  "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
##  $ xMed         : num  11.4 11.7 11.9 11.7 11.6 ...
##  $ yMed         : num  11.7 12 12.1 11.9 11.9 ...
##  $ predictedSex : chr  "M" "M" "M" "M" ...
##  $ SV           : num  -0.1222 -0.011 -0.1074 -0.073 -0.0265 ...
##  $ Epithelial   : num  0.387 0.436 0.367 0.413 0.387 ...
##  $ Fibroblast   : num  0.326 0.316 0.298 0.24 0.312 ...
##  $ B_Cell       : num  0.0502 0.0454 0.0582 0.0528 0.0482 ...
##  $ NK_Cell      : num  0.065 0.0601 0.0816 0.0986 0.0743 ...
##  $ CD4T         : num  0.0478 0.0467 0.0545 0.0658 0.0544 ...
##  $ CD8T         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Monocyte     : num  0.0685 0.05 0.081 0.0666 0.0651 ...
##  $ Neutrophil   : num  0 0 0 0 0 ...
##  $ Eosinophil   : num  0.0555 0.0457 0.0597 0.0632 0.0597 ...
#Check order.
identical(rownames(AMD_pData), colnames(M_values.funnorm.filt)) #TRUE. 
## [1] TRUE
#Sanity check - there should be no NAs or infinite numbers - which could be a result of logit transformation of 0 or 1 beta values. 
all(complete.cases(M_values.funnorm.filt)) == "TRUE" #TRUE - meaning no NA or infinite numbers. 
## [1] TRUE
library(pbapply) #Progress bar for apply functions.

#EWAS on Age - All samples + SV.
#LM: Need to use transformed M-values instead of beta values as it is more statistically sound.
Age.SV_LM_pval <- pbsapply(1:nrow(M_values.funnorm.filt), function(CpG){
  meta <- AMD_pData
  meta$Mval <- M_values.funnorm.filt[CpG,]
  mod_Age.SV <- lm(Mval ~ Age + SV, data = meta) #Only SV as covariate.
  coef(summary(mod_Age.SV))[2,4]}) #Returns nominal p-value for Age for model at each CpG.
head(Age.SV_LM_pval)

save(Age.SV_LM_pval, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Age.SV_LM_pval.RData")
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Age.SV_LM_pval.RData")

#Inspect p-value distribution for model.
pvalue_dist_Age.SV <- data.frame(CpG = rownames(M_values.funnorm.filt), Nominal_P = Age.SV_LM_pval)
ggplot(pvalue_dist_Age.SV, aes(Nominal_P)) + 
  geom_histogram(fill = "grey90", color = "black") + 
  theme_classic() + xlab("Nominal P Value") + 
  ylim(0, 20000) + 
  xlim(min(Age.SV_LM_pval), max(Age.SV_LM_pval))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).

#Not right-skewed. Distribution is a little even. 

#Multiple test correction with FDR.
M_values.funnorm.filt <- as.data.frame(M_values.funnorm.filt)
Multi_test_corr_relaxed <- p.adjust(Age.SV_LM_pval, method = "fdr", n = length(Age.SV_LM_pval))

#Looking at FDR thresholds for hits:
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.05),]) #5 at 0.05.
## [1]  5 44
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.1),]) #18 at 0.1.
## [1] 18 44
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.2),]) #133 at 0.2.
## [1] 133  44
#Looking at top hits by nominal P:
pvalue_dist_Age.SV <- pvalue_dist_Age.SV[order(pvalue_dist_Age.SV$Nominal_P),]
head(pvalue_dist_Age.SV)
##               CpG    Nominal_P
## 51300  cg22197050 7.223296e-08
## 100675 cg25359907 1.752215e-07
## 347431 cg06596654 2.082010e-07
## 49545  cg21875802 2.906234e-07
## 291484 cg05037876 3.405108e-07
## 295662 cg02727104 1.053368e-06
#Load 450K annotation data.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
head(EPIC_Annotation_Complete)
##                  Name AddressA_ID
## cg07881041 cg07881041  0085713262
## cg18478105 cg18478105  0046761277
## cg23229610 cg23229610  0021717843
## cg03513874 cg03513874  0029622133
## cg09835024 cg09835024  0016745152
## cg05451842 cg05451842  0016681196
##                                              AlleleA_ProbeSeq AddressB_ID
## cg07881041 CTACAAATACAACACCCTCAACCCATATTTCATATATTATCTCATTTAAC            
## cg18478105 AAATAAATTTCACTCTCAAATCCCAATCTCATACAACAAAACAAAAACCA  0086644198
## cg23229610 ATAAAATTCTTTCCTTAAAAAACATTAAAACCAAAATAAACAAAAATTCC            
## cg03513874 ACAATAAAATAATAAAATCCCATCACTACTTACCCTCCTTAAATAATATC            
## cg09835024 AATAAACACCAACCCCAAACCAATCTCACTTTATTAAATTACAAAAATCA  0081631976
## cg05451842 CRTTCAAATACACTATAACCCRACTAAAAAAACCCCCAACAACCCAAAAC            
##                                              AlleleB_ProbeSeq
## cg07881041                                                   
## cg18478105 AAATAAATTTCGCTCTCAAATCCCAATCTCGTACGACGAAACGAAAACCG
## cg23229610                                                   
## cg03513874                                                   
## cg09835024 AATAAACGCCGACCCCGAACCGATCTCGCTTTATTAAATTACAAAAATCG
## cg05451842                                                   
##            Infinium_Design_Type Next_Base Color_Channel
## cg07881041                   II                        
## cg18478105                    I         C           Grn
## cg23229610                   II                        
## cg03513874                   II                        
## cg09835024                    I         A           Red
## cg05451842                   II                        
##                                                                                                                        Forward_Sequence
## cg07881041 CTGCACGCCTACTGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAA[CG]CATGTATCATCTCATTTAATGCATGCATTATCTCATTTAATTCTCACAACCCCTCAGGTG
## cg18478105 TCCCGTCTTACGGGATGGATTTCGCTCTCAGGTCCCAGTCTCGTGCGGCGGGGCGGGGAC[CG]CAGCCGGCTGGGCGGGGAAGCCCTGAGCCGGGGAAGTCACGTGGGGCGTGTCCGGAGGCG
## cg23229610 GTTTCTGGACAGTAAAATTCTTTCCTTGAAGGACATTAGGGCCAAAATGGGCAAGGATTC[CG]AGATTGGTACATCGAGCGTTATCTTCCAACTCTCTTTTCTAAATGGGCTCATTTAGTAAT
## cg03513874 ATTGTGCCCACCTTGCTGCTGACAGTTAAGCATCACTAAAGTAGGAAATAGGGTCCAAAC[CG]ACACTACTTAAGGAGGGCAAGTAGTGATGGGACCTCATCATCCCATTGCTATCATGGAGC
## cg09835024 AGCCCCGTCATAGGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAAT[CG]GTATTCAAAAAAAAAAAAAAAAAAGGGCGGGGAGGACACTCCCTCTTCTCTGTTCCCACA
## cg05451842 CACAGCGTGGATGCCCCGATTTCCCAGGTCCCTCCGCAACCCTCAGTAGAACTCCCACCG[CG]CCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAACGCCCCGGTCCGT
##            Genome_Build CHR   MAPINFO
## cg07881041           37  19   5236016
## cg18478105           37  20  61847650
## cg23229610           37   1   6841125
## cg03513874           37   2 198303466
## cg09835024           37   X  24072640
## cg05451842           37  14  93581139
##                                                     SourceSeq Strand
## cg07881041 TGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAACG      R
## cg18478105 CGGTCCCCGCCCCGCCGCACGAGACTGGGACCTGAGAGCGAAATCCATCC      R
## cg23229610 CGGAATCCTTGCCCATTTTGGCCCTAATGTCCTTCAAGGAAAGAATTTTA      R
## cg03513874 CAATGGGATGATGAGGTCCCATCACTACTTGCCCTCCTTAAGTAGTGTCG      F
## cg09835024 GGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAATCG      R
## cg05451842 CGCCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAAC      F
##                  UCSC_RefGene_Name                  UCSC_RefGene_Accession
## cg07881041 PTPRS;PTPRS;PTPRS;PTPRS NM_130855;NM_002850;NM_130854;NM_130853
## cg18478105                  YTHDF1                               NM_017798
## cg23229610                                                                
## cg03513874                                                                
## cg09835024                  EIF2S3                               NM_001415
## cg05451842       ITPK1;ITPK1;ITPK1     NM_001142593;NM_014216;NM_001142594
##             UCSC_RefGene_Group    UCSC_CpG_Islands_Name
## cg07881041 Body;Body;Body;Body    chr19:5237294-5237669
## cg18478105              TSS200  chr20:61846843-61848103
## cg23229610                         chr1:6844313-6846366
## cg03513874                     chr2:198299244-198299972
## cg09835024             TSS1500   chrX:24072558-24073135
## cg05451842      Body;Body;Body  chr14:93581083-93582797
##            Relation_to_UCSC_CpG_Island Phantom4_Enhancers Phantom5_Enhancers
## cg07881041                     N_Shore                                      
## cg18478105                      Island                                      
## cg23229610                     N_Shelf                                      
## cg03513874                     S_Shelf                                      
## cg09835024                      Island                                      
## cg05451842                      Island                                      
##            DMR X450k_Enhancer           HMM_Island Regulatory_Feature_Name
## cg07881041                 NA                                             
## cg18478105                 NA 20:61317142-61318498    20:61846284-61847956
## cg23229610                 NA                                             
## cg03513874                 NA                                             
## cg09835024                 NA                          X:24071907-24073667
## cg05451842                 NA 14:92650663-92652544                        
##            Regulatory_Feature_Group GencodeBasicV12_NAME
## cg07881041                                              
## cg18478105      Promoter_Associated        YTHDF1;YTHDF1
## cg23229610                                              
## cg03513874                                              
## cg09835024      Promoter_Associated               EIF2S3
## cg05451842                                         ITPK1
##                      GencodeBasicV12_Accession GencodeBasicV12_Group
## cg07881041                                                          
## cg18478105 ENST00000370334.4;ENST00000370339.3         TSS200;TSS200
## cg23229610                                                          
## cg03513874                                                          
## cg09835024                   ENST00000253039.4                TSS200
## cg05451842                   ENST00000555495.1                 5'UTR
##             GencodeCompV12_NAME
## cg07881041                     
## cg18478105        YTHDF1;YTHDF1
## cg23229610                     
## cg03513874                     
## cg09835024 EIF2S3;EIF2S3;EIF2S3
## cg05451842                ITPK1
##                                         GencodeCompV12_Accession
## cg07881041                                                      
## cg18478105                   ENST00000370334.4;ENST00000370339.3
## cg23229610                                                      
## cg03513874                                                      
## cg09835024 ENST00000487075.1;ENST00000423068.1;ENST00000253039.4
## cg05451842                                     ENST00000555495.1
##              GencodeCompV12_Group DNase_Hypersensitivity_NAME
## cg07881041                                                   
## cg18478105          TSS200;TSS200     chr20:61847520-61847755
## cg23229610                                                   
## cg03513874                                                   
## cg09835024 TSS1500;TSS1500;TSS200      chrX:24072600-24073395
## cg05451842                  5'UTR     chr14:93581080-93581375
##            DNase_Hypersensitivity_Evidence_Count OpenChromatin_NAME
## cg07881041                                    NA                   
## cg18478105                                     3                   
## cg23229610                                    NA                   
## cg03513874                                    NA                   
## cg09835024                                     3                   
## cg05451842                                     3                   
##            OpenChromatin_Evidence_Count TFBS_NAME TFBS_Evidence_Count
## cg07881041                           NA                            NA
## cg18478105                           NA                            NA
## cg23229610                           NA                            NA
## cg03513874                           NA                            NA
## cg09835024                           NA                            NA
## cg05451842                           NA                            NA
##            Methyl27_Loci Methyl450_Loci Chromosome_36 Coordinate_36
## cg07881041            NA           TRUE            19       5187016
## cg18478105            NA           TRUE            20      61318095
## cg23229610            NA           TRUE             1       6763712
## cg03513874            NA           TRUE             2     198011711
## cg09835024            NA           TRUE             X      23982561
## cg05451842            NA           TRUE            14      92650892
##                             SNP_ID SNP_DISTANCE SNP_MinorAlleleFrequency
## cg07881041             rs187313142           18                 0.000200
## cg18478105             rs549944121            5                 0.001797
## cg23229610 rs545824288;rs527255711        40;12        0.000200;0.001198
## cg03513874                                                              
## cg09835024                                                              
## cg05451842             rs550745821           22                 0.000200
##            Random_Loci  X strand Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs
## cg07881041          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg18478105          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg23229610          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg03513874          NA NA      +     <NA>        NA   <NA>      NA   <NA>
## cg09835024          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg05451842          NA NA      +     <NA>        NA   <NA>      NA   <NA>
##            SBE_maf CH_450_XY CH_450_Aut CH_EPIC Cross_Hyb
## cg07881041      NA        No         No      No        No
## cg18478105      NA        No         No      No        No
## cg23229610      NA        No         No      No        No
## cg03513874      NA        No         No      No        No
## cg09835024      NA        No         No      No        No
## cg05451842      NA        No         No      No        No
dim(hits_CpGs <- pvalue_dist_Age.SV[which(pvalue_dist_Age.SV$Nominal_P < 1e-6),]) #5 hits.
## [1] 5 2
hits <- EPIC_Annotation_Complete[which(EPIC_Annotation_Complete$Name%in%hits_CpGs$CpG),]
hits$UCSC_RefGene_Name
## [1] LOC84740                               LOC100132215 SFRS8       
## 66070 Levels:  A1BG A1BG-AS1;A1BG A1BG-AS1;A1BG;ZNF497;ZNF497 ... ZZZ3;ZZZ3;ZZZ3
#Delta beta.
#Using Maggie's code for deltabeta:
deltabeta <- function(df, mainvar, covar1 = NULL, covar2 = NULL, covar3 = NULL, covar4 = NULL, covar5 = NULL) {
  # Calculating delta beta of the main variable of interest (mainvar), with up to 5 possible covariates (covar)
  # mainvar should be a vector of continuous variable
  # all covars should also be vectors
  # df = dataframe or matrix of beta values
  # output is a vector of delta beta values
  sd=sd(mainvar)
  qt <-
    range <- max(mainvar, na.rm = T) - min(mainvar, na.rm = T)
  dB <- vector(mode = "numeric", length = nrow(df))
  names(dB) <- rownames(df)
  for (i in 1:nrow(df)) {
    beta <- df[i, ]
    if (is.null(covar1)) {
      mod <- lm(beta ~ mainvar)
    } else if (is.null(covar2)) {
      mod <- lm(beta ~ mainvar + covar1)
    } else if (is.null(covar3)) {
      mod <- lm(beta ~ mainvar + covar1 + covar2)
    } else if (is.null(covar4)) {
      mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3)
    } else if (is.null(covar5)) {
      mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4)
    } else {
      mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4 + covar5)
    }
    slope <- mod$coefficients[2]
    dB[i] <- as.numeric(slope*range)
  }
  dB
}

betas.funnorm.filt <- m2beta(M_values.funnorm.filt)

delta_beta_Age.SV_fixed <- deltabeta(as.matrix(betas.funnorm.filt), AMD_pData$Age, covar1 = AMD_pData$SV)
length(delta_beta_Age.SV_fixed)
summary(delta_beta_Age.SV_fixed)

save(delta_beta_Age.SV_fixed, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/DB_Age.SV_fixed.RData")
##3. Linear Model: Volcano Plot
#Volcano to examine hits (for DB, see below chunks):
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/DB_Age.SV_fixed.RData")

#Call Volcano (Nominal p Version, modified from Rachel's code):
source("/home/BCRICWH.LAN/dlin/KoborLab/kobor_space/cake/home/dlin/Volcano_DL_Nominal.R")

#After running the last 2 chunks, make a summary table with CpG, Nominal_P, FDR, and Delta Beta.
Age.SV_Table <- data.frame(rownames(M_values.funnorm.filt), Age.SV_LM_pval, Multi_test_corr_relaxed, delta_beta_Age.SV_fixed)
colnames(Age.SV_Table) = c("CpG", "Nominal_P", "FDR", "Delta_Beta")
identical(as.character(rownames(Age.SV_Table)), as.character(Age.SV_Table$CpG)) #TRUE.
## [1] TRUE
#Looking at top hits quickly without considering DB:
head(Age.SV_Table[order(Age.SV_Table$Nominal_P),],10)
##                   CpG    Nominal_P        FDR Delta_Beta
## cg22197050 cg22197050 7.223296e-08 0.02897447 0.07358795
## cg25359907 cg25359907 1.752215e-07 0.02897447 0.04685236
## cg06596654 cg06596654 2.082010e-07 0.02897447 0.19414626
## cg21875802 cg21875802 2.906234e-07 0.02897447 0.15773330
## cg05037876 cg05037876 3.405108e-07 0.02897447 0.05090909
## cg02727104 cg02727104 1.053368e-06 0.06004712 0.12536605
## cg10074727 cg10074727 1.230206e-06 0.06004712 0.12122329
## cg09011833 cg09011833 1.471122e-06 0.06004712 0.04802012
## cg22166290 cg22166290 1.486331e-06 0.06004712 0.06491800
## cg23813012 cg23813012 1.659727e-06 0.06004712 0.08996610
##Setting a threshold of 0.05DB, 5e-6 Nominal P (scale to 0.60DB):
makeVolcano_nominal(Age.SV_Table$Nominal_P, Age.SV_Table$Delta_Beta, 0.05, 5e-6, "DNAm changes", 0.3) #at 5e-6: 17 Hypermethylated, 1 Hypomethylated
## [1] "Hypermethylated: 17"
## [1] "Hypomethylated: 1"
## Warning: Removed 230 rows containing missing values (geom_point).

#What are these hits?
#First make an annotated table - load 450K manifest.
load("/home/BCRICWH.LAN/ngladish/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
colnames(EPIC_Annotation_Complete)
##  [1] "Name"                                 
##  [2] "AddressA_ID"                          
##  [3] "AlleleA_ProbeSeq"                     
##  [4] "AddressB_ID"                          
##  [5] "AlleleB_ProbeSeq"                     
##  [6] "Infinium_Design_Type"                 
##  [7] "Next_Base"                            
##  [8] "Color_Channel"                        
##  [9] "Forward_Sequence"                     
## [10] "Genome_Build"                         
## [11] "CHR"                                  
## [12] "MAPINFO"                              
## [13] "SourceSeq"                            
## [14] "Strand"                               
## [15] "UCSC_RefGene_Name"                    
## [16] "UCSC_RefGene_Accession"               
## [17] "UCSC_RefGene_Group"                   
## [18] "UCSC_CpG_Islands_Name"                
## [19] "Relation_to_UCSC_CpG_Island"          
## [20] "Phantom4_Enhancers"                   
## [21] "Phantom5_Enhancers"                   
## [22] "DMR"                                  
## [23] "X450k_Enhancer"                       
## [24] "HMM_Island"                           
## [25] "Regulatory_Feature_Name"              
## [26] "Regulatory_Feature_Group"             
## [27] "GencodeBasicV12_NAME"                 
## [28] "GencodeBasicV12_Accession"            
## [29] "GencodeBasicV12_Group"                
## [30] "GencodeCompV12_NAME"                  
## [31] "GencodeCompV12_Accession"             
## [32] "GencodeCompV12_Group"                 
## [33] "DNase_Hypersensitivity_NAME"          
## [34] "DNase_Hypersensitivity_Evidence_Count"
## [35] "OpenChromatin_NAME"                   
## [36] "OpenChromatin_Evidence_Count"         
## [37] "TFBS_NAME"                            
## [38] "TFBS_Evidence_Count"                  
## [39] "Methyl27_Loci"                        
## [40] "Methyl450_Loci"                       
## [41] "Chromosome_36"                        
## [42] "Coordinate_36"                        
## [43] "SNP_ID"                               
## [44] "SNP_DISTANCE"                         
## [45] "SNP_MinorAlleleFrequency"             
## [46] "Random_Loci"                          
## [47] "X"                                    
## [48] "strand"                               
## [49] "Probe_rs"                             
## [50] "Probe_maf"                            
## [51] "CpG_rs"                               
## [52] "CpG_maf"                              
## [53] "SBE_rs"                               
## [54] "SBE_maf"                              
## [55] "CH_450_XY"                            
## [56] "CH_450_Aut"                           
## [57] "CH_EPIC"                              
## [58] "Cross_Hyb"
Age.SV_Table.annotated = merge(Age.SV_Table, EPIC_Annotation_Complete[,c("Name", "CHR", "Strand", "UCSC_RefGene_Name", "UCSC_RefGene_Group")], by.x = "CpG", by.y = "Name", all = FALSE)
colnames(Age.SV_Table.annotated)[5:6] = c("Chromosome", "Coordinate")
Age.SV_Table.annotated <- Age.SV_Table.annotated[order(Age.SV_Table.annotated$Nominal_P),]

#Grabbing the Volcano hits:
LM_Age.SV_Hits <- Age.SV_Table.annotated[which(abs(Age.SV_Table.annotated$Delta_Beta)>0.05 & Age.SV_Table.annotated$Nominal_P<5e-6),]

#Let's order by Nominal_P:
LM_Age.SV_Hits = LM_Age.SV_Hits[order(LM_Age.SV_Hits$Nominal_P),]
rownames(LM_Age.SV_Hits) = c()
str(LM_Age.SV_Hits)
## 'data.frame':    17 obs. of  8 variables:
##  $ CpG               : Factor w/ 425456 levels "cg00000029","cg00000108",..: 345056 112794 340924 88163 48814 168455 344702 367394 106386 28944 ...
##  $ Nominal_P         : num  7.22e-08 2.08e-07 2.91e-07 3.41e-07 1.05e-06 ...
##  $ FDR               : num  0.029 0.029 0.029 0.029 0.06 ...
##  $ Delta_Beta        : num  0.0736 0.1941 0.1577 0.0509 0.1254 ...
##  $ Chromosome        : Factor w/ 25 levels "","1","10","11",..: 13 9 13 5 6 20 13 2 13 18 ...
##  $ Coordinate        : Factor w/ 3 levels "","F","R": 3 3 3 2 2 3 2 3 3 3 ...
##  $ UCSC_RefGene_Name : Factor w/ 66070 levels "","A1BG","A1BG-AS1;A1BG",..: 30522 1 1 51815 1 21228 4760 45910 54913 1 ...
##  $ UCSC_RefGene_Group: Factor w/ 8044 levels "","1stExon","1stExon;1stExon",..: 5441 1 1 3719 1 5441 151 5441 1209 1 ...
head(LM_Age.SV_Hits)
##          CpG    Nominal_P        FDR Delta_Beta Chromosome Coordinate
## 1 cg22197050 7.223296e-08 0.02897447 0.07358795          2          R
## 2 cg06596654 2.082010e-07 0.02897447 0.19414626         16          R
## 3 cg21875802 2.906234e-07 0.02897447 0.15773330          2          R
## 4 cg05037876 3.405108e-07 0.02897447 0.05090909         12          F
## 5 cg02727104 1.053368e-06 0.06004712 0.12536605         13          F
## 6 cg10074727 1.230206e-06 0.06004712 0.12122329          6          R
##   UCSC_RefGene_Name UCSC_RefGene_Group
## 1      LOC100132215            TSS1500
## 2                                     
## 3                                     
## 4             SFRS8               Body
## 5                                     
## 6              GCM2            TSS1500

Age-associated hits: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3482848/ Non-tissue specific hits: https://epigeneticsandchromatin.biomedcentral.com/articles/10.1186/s13072-018-0191-3

#Reorder row index.
rownames(Age.SV_Table.annotated) <- NULL

#Look at hits based on candidate genes.
head(Age.SV_Table.annotated[which(Age.SV_Table.annotated$UCSC_RefGene_Name == "ELOVL2"),]) #First ELOVL2 hit == row 78.
##               CpG    Nominal_P       FDR  Delta_Beta Chromosome Coordinate
## 72     cg16867657 3.463246e-05 0.1809725  0.12392410          6          F
## 396    cg24724428 2.750514e-04 0.2754662  0.14106900          6          F
## 4137   cg21572722 4.903703e-03 0.4655403  0.06381768          6          F
## 16853  cg01799681 2.590172e-02 0.5940401 -0.06551984          6          F
## 72273  cg13562911 1.398169e-01 0.7592642  0.01746062          6          R
## 143610 cg25151806 3.067914e-01 0.8427731  0.01099010          6          R
##        UCSC_RefGene_Name UCSC_RefGene_Group
## 72                ELOVL2            TSS1500
## 396               ELOVL2            TSS1500
## 4137              ELOVL2            TSS1500
## 16853             ELOVL2               Body
## 72273             ELOVL2               Body
## 143610            ELOVL2            TSS1500
head(Age.SV_Table.annotated[which(Age.SV_Table.annotated$UCSC_RefGene_Name == "EDARADD"),]) #First EDARADD hit == row 92628.
##              CpG Nominal_P       FDR Delta_Beta Chromosome Coordinate
## 85570 cg18964582 0.1697889 0.7798321 0.01286814          1          R
##       UCSC_RefGene_Name UCSC_RefGene_Group
## 85570           EDARADD            TSS1500
head(Age.SV_Table.annotated[which(Age.SV_Table.annotated$UCSC_RefGene_Name == "TOM1L1"),]) #First TOM1L1 hit == row 836.
##              CpG   Nominal_P       FDR Delta_Beta Chromosome Coordinate
## 778   cg25431220 0.000621525 0.3161219 0.02266178         17          F
## 6860  cg07081054 0.008901437 0.5071876 0.05113179         17          R
## 28131 cg03870845 0.047096992 0.6500937 0.01823156         17          F
## 48904 cg05265484 0.088672703 0.7092635 0.06107258         17          F
## 77772 cg10237252 0.152027983 0.7676020 0.08424283         17          R
## 99535 cg12240603 0.201335729 0.7957406 0.02151003         17          F
##       UCSC_RefGene_Name UCSC_RefGene_Group
## 778              TOM1L1             TSS200
## 6860             TOM1L1               Body
## 28131            TOM1L1             TSS200
## 48904            TOM1L1            TSS1500
## 77772            TOM1L1            TSS1500
## 99535            TOM1L1            TSS1500
head(Age.SV_Table.annotated[which(Age.SV_Table.annotated$UCSC_RefGene_Name == "NPTX2"),]) #First NPTX2 hit == row 3231.
##              CpG   Nominal_P       FDR Delta_Beta Chromosome Coordinate
## 3006  cg02368096 0.003344363 0.4403836 0.07417906          7          R
## 10763 cg13878520 0.015165880 0.5479406 0.03723864          7          R
## 40150 cg08315202 0.070754743 0.6875276 0.04161135          7          R
## 44194 cg13314145 0.078951478 0.6980998 0.02343091          7          R
## 52529 cg13695954 0.096517852 0.7192038 0.01227917          7          F
## 55615 cg13585675 0.103227394 0.7268572 0.03538950          7          F
##       UCSC_RefGene_Name UCSC_RefGene_Group
## 3006              NPTX2               Body
## 10763             NPTX2               Body
## 40150             NPTX2            TSS1500
## 44194             NPTX2            TSS1500
## 52529             NPTX2               Body
## 55615             NPTX2               Body

EWAS Model 3: All samples on Age + SV + Chip.

SV (cell type) and chip as covariates.

#Load all the objects.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_SVA_M_values.funnorm.RData") 

dim(AMD_pData) #44 samples.
## [1] 44 22
dim(M_values.funnorm.filt) #425456 probes.
## [1] 425456     44
AMD_pData$Disease_State <- as.factor(AMD_pData$Disease_State)
AMD_pData$Sex <- as.factor(AMD_pData$Sex)
AMD_pData$Row <- as.factor(AMD_pData$Row)
AMD_pData$Chip <- as.factor(AMD_pData$Chip)
AMD_pData$Age <- as.numeric(AMD_pData$Age)

str(AMD_pData)
## 'data.frame':    44 obs. of  22 variables:
##  $ Sample_Name  : chr  "Sample 1" "Sample 10" "Sample 11" "Sample 12" ...
##  $ Disease_State: Factor w/ 2 levels "age-related macular degeneration",..: 2 2 2 1 1 1 2 1 1 1 ...
##  $ Sex          : Factor w/ 2 levels "F","M": 2 2 2 2 2 1 2 2 1 2 ...
##  $ Age          : num  61 74 70 76 79 89 66 70 83 76 ...
##  $ Tissue       : chr  "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" ...
##  $ Row          : Factor w/ 12 levels "R01C01","R01C02",..: 3 10 12 1 3 5 7 9 11 2 ...
##  $ Chip         : Factor w/ 4 levels "200723300084",..: 2 2 2 4 4 4 4 4 4 4 ...
##  $ Basename     : chr  "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
##  $ filenames    : chr  "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
##  $ xMed         : num  11.4 11.7 11.9 11.7 11.6 ...
##  $ yMed         : num  11.7 12 12.1 11.9 11.9 ...
##  $ predictedSex : chr  "M" "M" "M" "M" ...
##  $ SV           : num  -0.1222 -0.011 -0.1074 -0.073 -0.0265 ...
##  $ Epithelial   : num  0.387 0.436 0.367 0.413 0.387 ...
##  $ Fibroblast   : num  0.326 0.316 0.298 0.24 0.312 ...
##  $ B_Cell       : num  0.0502 0.0454 0.0582 0.0528 0.0482 ...
##  $ NK_Cell      : num  0.065 0.0601 0.0816 0.0986 0.0743 ...
##  $ CD4T         : num  0.0478 0.0467 0.0545 0.0658 0.0544 ...
##  $ CD8T         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Monocyte     : num  0.0685 0.05 0.081 0.0666 0.0651 ...
##  $ Neutrophil   : num  0 0 0 0 0 ...
##  $ Eosinophil   : num  0.0555 0.0457 0.0597 0.0632 0.0597 ...
#Check order.
identical(rownames(AMD_pData), colnames(M_values.funnorm.filt)) #TRUE. 
## [1] TRUE
#Sanity check - there should be no NAs or infinite numbers - which could be a result of logit transformation of 0 or 1 beta values. 
all(complete.cases(M_values.funnorm.filt)) == "TRUE" #TRUE - meaning no NA or infinite numbers. 
## [1] TRUE
library(pbapply) #Progress bar for apply functions.

#EWAS on Age - All samples + SV + Chip.
#LM: Need to use transformed M-values instead of beta values as it is more statistically sound.
Age.SV.chip_LM_pval <- pbsapply(1:nrow(M_values.funnorm.filt), function(CpG){
  meta <- AMD_pData
  meta$Mval <- M_values.funnorm.filt[CpG,]
  mod_Age.SV.chip <- lm(Mval ~ Age + SV + Chip, data = meta) #Only SV + Chip as covariate.
  coef(summary(mod_Age.SV.chip))[2,4]}) #Returns nominal p-value for Age for model at each CpG.
head(Age.SV.chip_LM_pval)

save(Age.SV.chip_LM_pval, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Age.SV.chip_LM_pval.RData")
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Age.SV.chip_LM_pval.RData")

#Inspect p-value distribution for model.
pvalue_dist_Age.SV.chip <- data.frame(CpG = rownames(M_values.funnorm.filt), Nominal_P = Age.SV.chip_LM_pval)
ggplot(pvalue_dist_Age.SV.chip, aes(Nominal_P)) + 
  geom_histogram(fill = "grey90", color = "black") + 
  theme_classic() + xlab("Nominal P Value") + 
  ylim(0, 20000) + 
  xlim(min(Age.SV.chip_LM_pval), max(Age.SV.chip_LM_pval))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).

#Right-skewed. 

#Multiple test correction with FDR.
M_values.funnorm.filt <- as.data.frame(M_values.funnorm.filt)
Multi_test_corr_relaxed <- p.adjust(Age.SV.chip_LM_pval, method = "fdr", n = length(Age.SV.chip_LM_pval))

#Looking at FDR thresholds for hits:
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.05),]) #0 at 0.05.
## [1]  0 44
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.1),]) #1 at 0.1.
## [1]  1 44
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.2),]) #12 at 0.2.
## [1] 12 44
#Looking at top hits by nominal P:
pvalue_dist_Age.SV.chip <- pvalue_dist_Age.SV.chip[order(pvalue_dist_Age.SV.chip$Nominal_P),]
head(pvalue_dist_Age.SV.chip)
##               CpG    Nominal_P
## 51300  cg22197050 1.494583e-07
## 49545  cg21875802 5.393880e-07
## 291484 cg05037876 1.510287e-06
## 100675 cg25359907 1.980075e-06
## 347431 cg06596654 2.143129e-06
## 176395 cg00361495 3.970641e-06
#Load 450K annotation data.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
head(EPIC_Annotation_Complete)
##                  Name AddressA_ID
## cg07881041 cg07881041  0085713262
## cg18478105 cg18478105  0046761277
## cg23229610 cg23229610  0021717843
## cg03513874 cg03513874  0029622133
## cg09835024 cg09835024  0016745152
## cg05451842 cg05451842  0016681196
##                                              AlleleA_ProbeSeq AddressB_ID
## cg07881041 CTACAAATACAACACCCTCAACCCATATTTCATATATTATCTCATTTAAC            
## cg18478105 AAATAAATTTCACTCTCAAATCCCAATCTCATACAACAAAACAAAAACCA  0086644198
## cg23229610 ATAAAATTCTTTCCTTAAAAAACATTAAAACCAAAATAAACAAAAATTCC            
## cg03513874 ACAATAAAATAATAAAATCCCATCACTACTTACCCTCCTTAAATAATATC            
## cg09835024 AATAAACACCAACCCCAAACCAATCTCACTTTATTAAATTACAAAAATCA  0081631976
## cg05451842 CRTTCAAATACACTATAACCCRACTAAAAAAACCCCCAACAACCCAAAAC            
##                                              AlleleB_ProbeSeq
## cg07881041                                                   
## cg18478105 AAATAAATTTCGCTCTCAAATCCCAATCTCGTACGACGAAACGAAAACCG
## cg23229610                                                   
## cg03513874                                                   
## cg09835024 AATAAACGCCGACCCCGAACCGATCTCGCTTTATTAAATTACAAAAATCG
## cg05451842                                                   
##            Infinium_Design_Type Next_Base Color_Channel
## cg07881041                   II                        
## cg18478105                    I         C           Grn
## cg23229610                   II                        
## cg03513874                   II                        
## cg09835024                    I         A           Red
## cg05451842                   II                        
##                                                                                                                        Forward_Sequence
## cg07881041 CTGCACGCCTACTGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAA[CG]CATGTATCATCTCATTTAATGCATGCATTATCTCATTTAATTCTCACAACCCCTCAGGTG
## cg18478105 TCCCGTCTTACGGGATGGATTTCGCTCTCAGGTCCCAGTCTCGTGCGGCGGGGCGGGGAC[CG]CAGCCGGCTGGGCGGGGAAGCCCTGAGCCGGGGAAGTCACGTGGGGCGTGTCCGGAGGCG
## cg23229610 GTTTCTGGACAGTAAAATTCTTTCCTTGAAGGACATTAGGGCCAAAATGGGCAAGGATTC[CG]AGATTGGTACATCGAGCGTTATCTTCCAACTCTCTTTTCTAAATGGGCTCATTTAGTAAT
## cg03513874 ATTGTGCCCACCTTGCTGCTGACAGTTAAGCATCACTAAAGTAGGAAATAGGGTCCAAAC[CG]ACACTACTTAAGGAGGGCAAGTAGTGATGGGACCTCATCATCCCATTGCTATCATGGAGC
## cg09835024 AGCCCCGTCATAGGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAAT[CG]GTATTCAAAAAAAAAAAAAAAAAAGGGCGGGGAGGACACTCCCTCTTCTCTGTTCCCACA
## cg05451842 CACAGCGTGGATGCCCCGATTTCCCAGGTCCCTCCGCAACCCTCAGTAGAACTCCCACCG[CG]CCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAACGCCCCGGTCCGT
##            Genome_Build CHR   MAPINFO
## cg07881041           37  19   5236016
## cg18478105           37  20  61847650
## cg23229610           37   1   6841125
## cg03513874           37   2 198303466
## cg09835024           37   X  24072640
## cg05451842           37  14  93581139
##                                                     SourceSeq Strand
## cg07881041 TGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAACG      R
## cg18478105 CGGTCCCCGCCCCGCCGCACGAGACTGGGACCTGAGAGCGAAATCCATCC      R
## cg23229610 CGGAATCCTTGCCCATTTTGGCCCTAATGTCCTTCAAGGAAAGAATTTTA      R
## cg03513874 CAATGGGATGATGAGGTCCCATCACTACTTGCCCTCCTTAAGTAGTGTCG      F
## cg09835024 GGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAATCG      R
## cg05451842 CGCCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAAC      F
##                  UCSC_RefGene_Name                  UCSC_RefGene_Accession
## cg07881041 PTPRS;PTPRS;PTPRS;PTPRS NM_130855;NM_002850;NM_130854;NM_130853
## cg18478105                  YTHDF1                               NM_017798
## cg23229610                                                                
## cg03513874                                                                
## cg09835024                  EIF2S3                               NM_001415
## cg05451842       ITPK1;ITPK1;ITPK1     NM_001142593;NM_014216;NM_001142594
##             UCSC_RefGene_Group    UCSC_CpG_Islands_Name
## cg07881041 Body;Body;Body;Body    chr19:5237294-5237669
## cg18478105              TSS200  chr20:61846843-61848103
## cg23229610                         chr1:6844313-6846366
## cg03513874                     chr2:198299244-198299972
## cg09835024             TSS1500   chrX:24072558-24073135
## cg05451842      Body;Body;Body  chr14:93581083-93582797
##            Relation_to_UCSC_CpG_Island Phantom4_Enhancers Phantom5_Enhancers
## cg07881041                     N_Shore                                      
## cg18478105                      Island                                      
## cg23229610                     N_Shelf                                      
## cg03513874                     S_Shelf                                      
## cg09835024                      Island                                      
## cg05451842                      Island                                      
##            DMR X450k_Enhancer           HMM_Island Regulatory_Feature_Name
## cg07881041                 NA                                             
## cg18478105                 NA 20:61317142-61318498    20:61846284-61847956
## cg23229610                 NA                                             
## cg03513874                 NA                                             
## cg09835024                 NA                          X:24071907-24073667
## cg05451842                 NA 14:92650663-92652544                        
##            Regulatory_Feature_Group GencodeBasicV12_NAME
## cg07881041                                              
## cg18478105      Promoter_Associated        YTHDF1;YTHDF1
## cg23229610                                              
## cg03513874                                              
## cg09835024      Promoter_Associated               EIF2S3
## cg05451842                                         ITPK1
##                      GencodeBasicV12_Accession GencodeBasicV12_Group
## cg07881041                                                          
## cg18478105 ENST00000370334.4;ENST00000370339.3         TSS200;TSS200
## cg23229610                                                          
## cg03513874                                                          
## cg09835024                   ENST00000253039.4                TSS200
## cg05451842                   ENST00000555495.1                 5'UTR
##             GencodeCompV12_NAME
## cg07881041                     
## cg18478105        YTHDF1;YTHDF1
## cg23229610                     
## cg03513874                     
## cg09835024 EIF2S3;EIF2S3;EIF2S3
## cg05451842                ITPK1
##                                         GencodeCompV12_Accession
## cg07881041                                                      
## cg18478105                   ENST00000370334.4;ENST00000370339.3
## cg23229610                                                      
## cg03513874                                                      
## cg09835024 ENST00000487075.1;ENST00000423068.1;ENST00000253039.4
## cg05451842                                     ENST00000555495.1
##              GencodeCompV12_Group DNase_Hypersensitivity_NAME
## cg07881041                                                   
## cg18478105          TSS200;TSS200     chr20:61847520-61847755
## cg23229610                                                   
## cg03513874                                                   
## cg09835024 TSS1500;TSS1500;TSS200      chrX:24072600-24073395
## cg05451842                  5'UTR     chr14:93581080-93581375
##            DNase_Hypersensitivity_Evidence_Count OpenChromatin_NAME
## cg07881041                                    NA                   
## cg18478105                                     3                   
## cg23229610                                    NA                   
## cg03513874                                    NA                   
## cg09835024                                     3                   
## cg05451842                                     3                   
##            OpenChromatin_Evidence_Count TFBS_NAME TFBS_Evidence_Count
## cg07881041                           NA                            NA
## cg18478105                           NA                            NA
## cg23229610                           NA                            NA
## cg03513874                           NA                            NA
## cg09835024                           NA                            NA
## cg05451842                           NA                            NA
##            Methyl27_Loci Methyl450_Loci Chromosome_36 Coordinate_36
## cg07881041            NA           TRUE            19       5187016
## cg18478105            NA           TRUE            20      61318095
## cg23229610            NA           TRUE             1       6763712
## cg03513874            NA           TRUE             2     198011711
## cg09835024            NA           TRUE             X      23982561
## cg05451842            NA           TRUE            14      92650892
##                             SNP_ID SNP_DISTANCE SNP_MinorAlleleFrequency
## cg07881041             rs187313142           18                 0.000200
## cg18478105             rs549944121            5                 0.001797
## cg23229610 rs545824288;rs527255711        40;12        0.000200;0.001198
## cg03513874                                                              
## cg09835024                                                              
## cg05451842             rs550745821           22                 0.000200
##            Random_Loci  X strand Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs
## cg07881041          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg18478105          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg23229610          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg03513874          NA NA      +     <NA>        NA   <NA>      NA   <NA>
## cg09835024          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg05451842          NA NA      +     <NA>        NA   <NA>      NA   <NA>
##            SBE_maf CH_450_XY CH_450_Aut CH_EPIC Cross_Hyb
## cg07881041      NA        No         No      No        No
## cg18478105      NA        No         No      No        No
## cg23229610      NA        No         No      No        No
## cg03513874      NA        No         No      No        No
## cg09835024      NA        No         No      No        No
## cg05451842      NA        No         No      No        No
dim(hits_CpGs <- pvalue_dist_Age.SV.chip[which(pvalue_dist_Age.SV.chip$Nominal_P < 1e-6),]) #2 hits.
## [1] 2 2
hits <- EPIC_Annotation_Complete[which(EPIC_Annotation_Complete$Name%in%hits_CpGs$CpG),]
hits$UCSC_RefGene_Name
## [1]              LOC100132215
## 66070 Levels:  A1BG A1BG-AS1;A1BG A1BG-AS1;A1BG;ZNF497;ZNF497 ... ZZZ3;ZZZ3;ZZZ3
#Delta beta.
#Using Maggie's code for deltabeta:
deltabeta <- function(df, mainvar, covar1 = NULL, covar2 = NULL, covar3 = NULL, covar4 = NULL, covar5 = NULL) {
  # Calculating delta beta of the main variable of interest (mainvar), with up to 5 possible covariates (covar)
  # mainvar should be a vector of continuous variable
  # all covars should also be vectors
  # df = dataframe or matrix of beta values
  # output is a vector of delta beta values
  sd=sd(mainvar)
  qt <-
    range <- max(mainvar, na.rm = T) - min(mainvar, na.rm = T)
  dB <- vector(mode = "numeric", length = nrow(df))
  names(dB) <- rownames(df)
  for (i in 1:nrow(df)) {
    beta <- df[i, ]
    if (is.null(covar1)) {
      mod <- lm(beta ~ mainvar)
    } else if (is.null(covar2)) {
      mod <- lm(beta ~ mainvar + covar1)
    } else if (is.null(covar3)) {
      mod <- lm(beta ~ mainvar + covar1 + covar2)
    } else if (is.null(covar4)) {
      mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3)
    } else if (is.null(covar5)) {
      mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4)
    } else {
      mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4 + covar5)
    }
    slope <- mod$coefficients[2]
    dB[i] <- as.numeric(slope*range)
  }
  dB
}

betas.funnorm.filt <- m2beta(M_values.funnorm.filt)

delta_beta_Age.SV.chip_fixed <- deltabeta(as.matrix(betas.funnorm.filt), AMD_pData$Age, covar1 = AMD_pData$SV, covar2 = AMD_pData$Chip)
length(delta_beta_Age.SV.chip_fixed)
summary(delta_beta_Age.SV.chip_fixed)

save(delta_beta_Age.SV.chip_fixed, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/DB_Age.SV.chip_fixed.RData")
##3. Linear Model: Volcano Plot
#Volcano to examine hits (for DB, see below chunks):
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/DB_Age.SV.chip_fixed.RData")

#Call Volcano (Nominal p Version, modified from Rachel's code):
source("/home/BCRICWH.LAN/dlin/KoborLab/kobor_space/cake/home/dlin/Volcano_DL_Nominal.R")

#After running the last 2 chunks, make a summary table with CpG, Nominal_P, FDR, and Delta Beta.
Age.SV.chip_Table <- data.frame(rownames(M_values.funnorm.filt), Age.SV.chip_LM_pval, Multi_test_corr_relaxed, delta_beta_Age.SV.chip_fixed)
colnames(Age.SV.chip_Table) = c("CpG", "Nominal_P", "FDR", "Delta_Beta")
identical(as.character(rownames(Age.SV.chip_Table)), as.character(Age.SV.chip_Table$CpG)) #TRUE.
## [1] TRUE
#Looking at top hits quickly without considering DB:
head(Age.SV.chip_Table[order(Age.SV.chip_Table$Nominal_P),],10)
##                   CpG    Nominal_P        FDR   Delta_Beta
## cg22197050 cg22197050 1.494583e-07 0.06358795  0.074437117
## cg21875802 cg21875802 5.393880e-07 0.11474293  0.154439604
## cg05037876 cg05037876 1.510287e-06 0.18236143  0.050491846
## cg25359907 cg25359907 1.980075e-06 0.18236143  0.043596572
## cg06596654 cg06596654 2.143129e-06 0.18236143  0.181927172
## cg00361495 cg00361495 3.970641e-06 0.19812786  0.167257219
## cg22166290 cg22166290 4.371720e-06 0.19812786  0.066983153
## cg22746333 cg22746333 4.424913e-06 0.19812786  0.032010515
## cg06173889 cg06173889 4.917622e-06 0.19812786  0.208709848
## cg14881459 cg14881459 5.347790e-06 0.19812786 -0.003442354
##Setting a threshold of 0.05DB, 5e-6 Nominal P (scale to 0.60DB):
makeVolcano_nominal(Age.SV.chip_Table$Nominal_P, Age.SV.chip_Table$Delta_Beta, 0.05, 1e-6, "DNAm changes", 0.3) #at 5e-6: 9 Hypermethylated, 0 Hypomethylated
## [1] "Hypermethylated: 2"
## [1] "Hypomethylated: 0"
## Warning: Removed 291 rows containing missing values (geom_point).

#What are these hits?
#First make an annotated table - load 450K manifest.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
head(EPIC_Annotation_Complete)
##                  Name AddressA_ID
## cg07881041 cg07881041  0085713262
## cg18478105 cg18478105  0046761277
## cg23229610 cg23229610  0021717843
## cg03513874 cg03513874  0029622133
## cg09835024 cg09835024  0016745152
## cg05451842 cg05451842  0016681196
##                                              AlleleA_ProbeSeq AddressB_ID
## cg07881041 CTACAAATACAACACCCTCAACCCATATTTCATATATTATCTCATTTAAC            
## cg18478105 AAATAAATTTCACTCTCAAATCCCAATCTCATACAACAAAACAAAAACCA  0086644198
## cg23229610 ATAAAATTCTTTCCTTAAAAAACATTAAAACCAAAATAAACAAAAATTCC            
## cg03513874 ACAATAAAATAATAAAATCCCATCACTACTTACCCTCCTTAAATAATATC            
## cg09835024 AATAAACACCAACCCCAAACCAATCTCACTTTATTAAATTACAAAAATCA  0081631976
## cg05451842 CRTTCAAATACACTATAACCCRACTAAAAAAACCCCCAACAACCCAAAAC            
##                                              AlleleB_ProbeSeq
## cg07881041                                                   
## cg18478105 AAATAAATTTCGCTCTCAAATCCCAATCTCGTACGACGAAACGAAAACCG
## cg23229610                                                   
## cg03513874                                                   
## cg09835024 AATAAACGCCGACCCCGAACCGATCTCGCTTTATTAAATTACAAAAATCG
## cg05451842                                                   
##            Infinium_Design_Type Next_Base Color_Channel
## cg07881041                   II                        
## cg18478105                    I         C           Grn
## cg23229610                   II                        
## cg03513874                   II                        
## cg09835024                    I         A           Red
## cg05451842                   II                        
##                                                                                                                        Forward_Sequence
## cg07881041 CTGCACGCCTACTGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAA[CG]CATGTATCATCTCATTTAATGCATGCATTATCTCATTTAATTCTCACAACCCCTCAGGTG
## cg18478105 TCCCGTCTTACGGGATGGATTTCGCTCTCAGGTCCCAGTCTCGTGCGGCGGGGCGGGGAC[CG]CAGCCGGCTGGGCGGGGAAGCCCTGAGCCGGGGAAGTCACGTGGGGCGTGTCCGGAGGCG
## cg23229610 GTTTCTGGACAGTAAAATTCTTTCCTTGAAGGACATTAGGGCCAAAATGGGCAAGGATTC[CG]AGATTGGTACATCGAGCGTTATCTTCCAACTCTCTTTTCTAAATGGGCTCATTTAGTAAT
## cg03513874 ATTGTGCCCACCTTGCTGCTGACAGTTAAGCATCACTAAAGTAGGAAATAGGGTCCAAAC[CG]ACACTACTTAAGGAGGGCAAGTAGTGATGGGACCTCATCATCCCATTGCTATCATGGAGC
## cg09835024 AGCCCCGTCATAGGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAAT[CG]GTATTCAAAAAAAAAAAAAAAAAAGGGCGGGGAGGACACTCCCTCTTCTCTGTTCCCACA
## cg05451842 CACAGCGTGGATGCCCCGATTTCCCAGGTCCCTCCGCAACCCTCAGTAGAACTCCCACCG[CG]CCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAACGCCCCGGTCCGT
##            Genome_Build CHR   MAPINFO
## cg07881041           37  19   5236016
## cg18478105           37  20  61847650
## cg23229610           37   1   6841125
## cg03513874           37   2 198303466
## cg09835024           37   X  24072640
## cg05451842           37  14  93581139
##                                                     SourceSeq Strand
## cg07881041 TGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAACG      R
## cg18478105 CGGTCCCCGCCCCGCCGCACGAGACTGGGACCTGAGAGCGAAATCCATCC      R
## cg23229610 CGGAATCCTTGCCCATTTTGGCCCTAATGTCCTTCAAGGAAAGAATTTTA      R
## cg03513874 CAATGGGATGATGAGGTCCCATCACTACTTGCCCTCCTTAAGTAGTGTCG      F
## cg09835024 GGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAATCG      R
## cg05451842 CGCCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAAC      F
##                  UCSC_RefGene_Name                  UCSC_RefGene_Accession
## cg07881041 PTPRS;PTPRS;PTPRS;PTPRS NM_130855;NM_002850;NM_130854;NM_130853
## cg18478105                  YTHDF1                               NM_017798
## cg23229610                                                                
## cg03513874                                                                
## cg09835024                  EIF2S3                               NM_001415
## cg05451842       ITPK1;ITPK1;ITPK1     NM_001142593;NM_014216;NM_001142594
##             UCSC_RefGene_Group    UCSC_CpG_Islands_Name
## cg07881041 Body;Body;Body;Body    chr19:5237294-5237669
## cg18478105              TSS200  chr20:61846843-61848103
## cg23229610                         chr1:6844313-6846366
## cg03513874                     chr2:198299244-198299972
## cg09835024             TSS1500   chrX:24072558-24073135
## cg05451842      Body;Body;Body  chr14:93581083-93582797
##            Relation_to_UCSC_CpG_Island Phantom4_Enhancers Phantom5_Enhancers
## cg07881041                     N_Shore                                      
## cg18478105                      Island                                      
## cg23229610                     N_Shelf                                      
## cg03513874                     S_Shelf                                      
## cg09835024                      Island                                      
## cg05451842                      Island                                      
##            DMR X450k_Enhancer           HMM_Island Regulatory_Feature_Name
## cg07881041                 NA                                             
## cg18478105                 NA 20:61317142-61318498    20:61846284-61847956
## cg23229610                 NA                                             
## cg03513874                 NA                                             
## cg09835024                 NA                          X:24071907-24073667
## cg05451842                 NA 14:92650663-92652544                        
##            Regulatory_Feature_Group GencodeBasicV12_NAME
## cg07881041                                              
## cg18478105      Promoter_Associated        YTHDF1;YTHDF1
## cg23229610                                              
## cg03513874                                              
## cg09835024      Promoter_Associated               EIF2S3
## cg05451842                                         ITPK1
##                      GencodeBasicV12_Accession GencodeBasicV12_Group
## cg07881041                                                          
## cg18478105 ENST00000370334.4;ENST00000370339.3         TSS200;TSS200
## cg23229610                                                          
## cg03513874                                                          
## cg09835024                   ENST00000253039.4                TSS200
## cg05451842                   ENST00000555495.1                 5'UTR
##             GencodeCompV12_NAME
## cg07881041                     
## cg18478105        YTHDF1;YTHDF1
## cg23229610                     
## cg03513874                     
## cg09835024 EIF2S3;EIF2S3;EIF2S3
## cg05451842                ITPK1
##                                         GencodeCompV12_Accession
## cg07881041                                                      
## cg18478105                   ENST00000370334.4;ENST00000370339.3
## cg23229610                                                      
## cg03513874                                                      
## cg09835024 ENST00000487075.1;ENST00000423068.1;ENST00000253039.4
## cg05451842                                     ENST00000555495.1
##              GencodeCompV12_Group DNase_Hypersensitivity_NAME
## cg07881041                                                   
## cg18478105          TSS200;TSS200     chr20:61847520-61847755
## cg23229610                                                   
## cg03513874                                                   
## cg09835024 TSS1500;TSS1500;TSS200      chrX:24072600-24073395
## cg05451842                  5'UTR     chr14:93581080-93581375
##            DNase_Hypersensitivity_Evidence_Count OpenChromatin_NAME
## cg07881041                                    NA                   
## cg18478105                                     3                   
## cg23229610                                    NA                   
## cg03513874                                    NA                   
## cg09835024                                     3                   
## cg05451842                                     3                   
##            OpenChromatin_Evidence_Count TFBS_NAME TFBS_Evidence_Count
## cg07881041                           NA                            NA
## cg18478105                           NA                            NA
## cg23229610                           NA                            NA
## cg03513874                           NA                            NA
## cg09835024                           NA                            NA
## cg05451842                           NA                            NA
##            Methyl27_Loci Methyl450_Loci Chromosome_36 Coordinate_36
## cg07881041            NA           TRUE            19       5187016
## cg18478105            NA           TRUE            20      61318095
## cg23229610            NA           TRUE             1       6763712
## cg03513874            NA           TRUE             2     198011711
## cg09835024            NA           TRUE             X      23982561
## cg05451842            NA           TRUE            14      92650892
##                             SNP_ID SNP_DISTANCE SNP_MinorAlleleFrequency
## cg07881041             rs187313142           18                 0.000200
## cg18478105             rs549944121            5                 0.001797
## cg23229610 rs545824288;rs527255711        40;12        0.000200;0.001198
## cg03513874                                                              
## cg09835024                                                              
## cg05451842             rs550745821           22                 0.000200
##            Random_Loci  X strand Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs
## cg07881041          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg18478105          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg23229610          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg03513874          NA NA      +     <NA>        NA   <NA>      NA   <NA>
## cg09835024          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg05451842          NA NA      +     <NA>        NA   <NA>      NA   <NA>
##            SBE_maf CH_450_XY CH_450_Aut CH_EPIC Cross_Hyb
## cg07881041      NA        No         No      No        No
## cg18478105      NA        No         No      No        No
## cg23229610      NA        No         No      No        No
## cg03513874      NA        No         No      No        No
## cg09835024      NA        No         No      No        No
## cg05451842      NA        No         No      No        No
Age.SV.chip_Table.annotated = merge(Age.SV.chip_Table, EPIC_Annotation_Complete[,c("Name", "CHR", "Strand", "UCSC_RefGene_Name", "UCSC_RefGene_Group")], by.x = "CpG", by.y = "Name", all = FALSE)
colnames(Age.SV.chip_Table.annotated)[5:6] = c("Chromosome", "Coordinate")
Age.SV.chip_Table.annotated <- Age.SV.chip_Table.annotated[order(Age.SV.chip_Table.annotated$Nominal_P),]

#Grabbing the Volcano hits:
LM_Age.SV.chip_Hits <- Age.SV.chip_Table.annotated[which(abs(Age.SV.chip_Table.annotated$Delta_Beta)>0.05 & Age.SV.chip_Table.annotated$Nominal_P<1e-6),]

#Let's order by Nominal_P:
LM_Age.SV.chip_Hits = LM_Age.SV.chip_Hits[order(LM_Age.SV.chip_Hits$Nominal_P),]
rownames(LM_Age.SV.chip_Hits) = c()
str(LM_Age.SV.chip_Hits)
## 'data.frame':    2 obs. of  8 variables:
##  $ CpG               : Factor w/ 425456 levels "cg00000029","cg00000108",..: 345056 340924
##  $ Nominal_P         : num  1.49e-07 5.39e-07
##  $ FDR               : num  0.0636 0.1147
##  $ Delta_Beta        : num  0.0744 0.1544
##  $ Chromosome        : Factor w/ 25 levels "","1","10","11",..: 13 13
##  $ Coordinate        : Factor w/ 3 levels "","F","R": 3 3
##  $ UCSC_RefGene_Name : Factor w/ 66070 levels "","A1BG","A1BG-AS1;A1BG",..: 30522 1
##  $ UCSC_RefGene_Group: Factor w/ 8044 levels "","1stExon","1stExon;1stExon",..: 5441 1
head(LM_Age.SV.chip_Hits)
##          CpG    Nominal_P        FDR Delta_Beta Chromosome Coordinate
## 1 cg22197050 1.494583e-07 0.06358795 0.07443712          2          R
## 2 cg21875802 5.393880e-07 0.11474293 0.15443960          2          R
##   UCSC_RefGene_Name UCSC_RefGene_Group
## 1      LOC100132215            TSS1500
## 2

Age-associated hits: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3482848/ Non-tissue specific hits: https://epigeneticsandchromatin.biomedcentral.com/articles/10.1186/s13072-018-0191-3

#Reorder row index.
rownames(Age.SV.chip_Table.annotated) <- NULL

#Look at hits based on candidate genes.
head(Age.SV.chip_Table.annotated[which(Age.SV.chip_Table.annotated$CpG %in% c("cg16867657", "cg21572722", "cg24724428")),]) #Identified ELOVL2 sites.
##              CpG    Nominal_P       FDR Delta_Beta Chromosome Coordinate
## 317   cg16867657 0.0004391873 0.5528210 0.09583086          6          F
## 1925  cg24724428 0.0030391951 0.6188314 0.10685771          6          F
## 13843 cg21572722 0.0265274260 0.7303255 0.05046642          6          F
##       UCSC_RefGene_Name UCSC_RefGene_Group
## 317              ELOVL2            TSS1500
## 1925             ELOVL2            TSS1500
## 13843            ELOVL2            TSS1500
head(Age.SV.chip_Table.annotated[which(Age.SV.chip_Table.annotated$CpG %in% c("cg06639320", "cg22454769", "cg24079702")),]) #Identified FHL2 sites.
##               CpG Nominal_P       FDR Delta_Beta Chromosome Coordinate
## 13065  cg22454769 0.0248804 0.7252093 0.05827410          2          R
## 146874 cg06639320 0.3367485 0.9043558 0.02140123          2          R
##          UCSC_RefGene_Name         UCSC_RefGene_Group
## 13065  FHL2;FHL2;FHL2;FHL2 TSS200;TSS200;5'UTR;TSS200
## 146874 FHL2;FHL2;FHL2;FHL2 TSS200;TSS200;5'UTR;TSS200
head(Age.SV.chip_Table.annotated[which(Age.SV.chip_Table.annotated$UCSC_RefGene_Name == "DIP2C"),]) #First TOM1L1 hit == row 3937.
##             CpG    Nominal_P       FDR Delta_Beta Chromosome Coordinate
## 132  cg19026125 0.0001882527 0.5405491 0.03074610         10          F
## 2649 cg04759187 0.0043212306 0.6329451 0.03315152         10          R
## 4980 cg19335436 0.0087526527 0.6716375 0.03228136         10          F
## 4993 cg04825276 0.0087756584 0.6716375 0.09008137         10          R
## 7664 cg06042849 0.0137971237 0.6817064 0.07309830         10          F
## 8686 cg16528305 0.0158812181 0.6900287 0.01924932         10          R
##      UCSC_RefGene_Name UCSC_RefGene_Group
## 132              DIP2C               Body
## 2649             DIP2C               Body
## 4980             DIP2C               Body
## 4993             DIP2C               Body
## 7664             DIP2C               Body
## 8686             DIP2C               Body
head(Age.SV.chip_Table.annotated[which(Age.SV.chip_Table.annotated$UCSC_RefGene_Name == "NPTX2"),]) #First NPTX2 hit == row 6776.
##              CpG  Nominal_P       FDR  Delta_Beta Chromosome Coordinate
## 6062  cg02368096 0.01073513 0.6731179  0.06982893          7          R
## 18378 cg13878520 0.03600149 0.7512299  0.03509352          7          R
## 27591 cg13585675 0.05554126 0.7794433  0.04179393          7          F
## 34876 cg13695954 0.07149678 0.7969352  0.01402217          7          F
## 54116 cg05168977 0.11518763 0.8320291 -0.04176249          7          F
## 63038 cg07666532 0.13566841 0.8430489  0.03103397          7          F
##       UCSC_RefGene_Name UCSC_RefGene_Group
## 6062              NPTX2               Body
## 18378             NPTX2               Body
## 27591             NPTX2               Body
## 34876             NPTX2               Body
## 54116             NPTX2            1stExon
## 63038             NPTX2               Body
head(Age.SV.chip_Table.annotated[which(Age.SV.chip_Table.annotated$UCSC_RefGene_Name == "PENK"),]) #First NPTX2 hit == row 6776.
##               CpG  Nominal_P       FDR   Delta_Beta Chromosome Coordinate
## 8369   cg16419235 0.01523717 0.6874777  0.055805237          8          R
## 78619  cg16219603 0.17179867 0.8578390  0.024079036          8          R
## 95990  cg03650233 0.21240700 0.8699754  0.042896140          8          R
## 126250 cg11610346 0.28559578 0.8915285 -0.009300056          8          R
## 230382 cg16072688 0.55096302 0.9446666  0.012854411          8          R
##        UCSC_RefGene_Name UCSC_RefGene_Group
## 8369                PENK            TSS1500
## 78619               PENK            TSS1500
## 95990               PENK            TSS1500
## 126250              PENK            TSS1500
## 230382              PENK            TSS1500

EWAS Model 4: All samples on Age + Sex + SV.

Sex and SV (cell type) as covariates.

#Load all the objects.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_SVA_M_values.funnorm.RData") 

dim(AMD_pData) #44 samples.
## [1] 44 22
dim(M_values.funnorm.filt) #425456 probes.
## [1] 425456     44
AMD_pData$Disease_State <- as.factor(AMD_pData$Disease_State)
AMD_pData$Sex <- as.factor(AMD_pData$Sex)
AMD_pData$Row <- as.factor(AMD_pData$Row)
AMD_pData$Chip <- as.factor(AMD_pData$Chip)
AMD_pData$Age <- as.numeric(AMD_pData$Age)

str(AMD_pData)
## 'data.frame':    44 obs. of  22 variables:
##  $ Sample_Name  : chr  "Sample 1" "Sample 10" "Sample 11" "Sample 12" ...
##  $ Disease_State: Factor w/ 2 levels "age-related macular degeneration",..: 2 2 2 1 1 1 2 1 1 1 ...
##  $ Sex          : Factor w/ 2 levels "F","M": 2 2 2 2 2 1 2 2 1 2 ...
##  $ Age          : num  61 74 70 76 79 89 66 70 83 76 ...
##  $ Tissue       : chr  "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" ...
##  $ Row          : Factor w/ 12 levels "R01C01","R01C02",..: 3 10 12 1 3 5 7 9 11 2 ...
##  $ Chip         : Factor w/ 4 levels "200723300084",..: 2 2 2 4 4 4 4 4 4 4 ...
##  $ Basename     : chr  "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
##  $ filenames    : chr  "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
##  $ xMed         : num  11.4 11.7 11.9 11.7 11.6 ...
##  $ yMed         : num  11.7 12 12.1 11.9 11.9 ...
##  $ predictedSex : chr  "M" "M" "M" "M" ...
##  $ SV           : num  -0.1222 -0.011 -0.1074 -0.073 -0.0265 ...
##  $ Epithelial   : num  0.387 0.436 0.367 0.413 0.387 ...
##  $ Fibroblast   : num  0.326 0.316 0.298 0.24 0.312 ...
##  $ B_Cell       : num  0.0502 0.0454 0.0582 0.0528 0.0482 ...
##  $ NK_Cell      : num  0.065 0.0601 0.0816 0.0986 0.0743 ...
##  $ CD4T         : num  0.0478 0.0467 0.0545 0.0658 0.0544 ...
##  $ CD8T         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Monocyte     : num  0.0685 0.05 0.081 0.0666 0.0651 ...
##  $ Neutrophil   : num  0 0 0 0 0 ...
##  $ Eosinophil   : num  0.0555 0.0457 0.0597 0.0632 0.0597 ...
#Check order.
identical(rownames(AMD_pData), colnames(M_values.funnorm.filt)) #TRUE. 
## [1] TRUE
#Sanity check - there should be no NAs or infinite numbers - which could be a result of logit transformation of 0 or 1 beta values. 
all(complete.cases(M_values.funnorm.filt)) == "TRUE" #TRUE - meaning no NA or infinite numbers. 
## [1] TRUE
library(pbapply) #Progress bar for apply functions.

#EWAS on Age - All samples + Sex + SV.
#LM: Need to use transformed M-values instead of beta values as it is more statistically sound.
Age.Sex.SV_LM_pval <- pbsapply(1:nrow(M_values.funnorm.filt), function(CpG){
  meta <- AMD_pData
  meta$Mval <- M_values.funnorm.filt[CpG,]
  mod_Age.Sex.SV <- lm(Mval ~ Age + Sex + SV, data = meta) #Only Sex + SV as covariate.
  coef(summary(mod_Age.Sex.SV))[2,4]}) #Returns nominal p-value for Age for model at each CpG.
head(Age.Sex.SV_LM_pval)

save(Age.Sex.SV_LM_pval, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Age.Sex.SV_LM_pval.RData")
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Age.Sex.SV_LM_pval.RData")

#Inspect p-value distribution for model.
pvalue_dist_Age.Sex.SV <- data.frame(CpG = rownames(M_values.funnorm.filt), Nominal_P = Age.Sex.SV_LM_pval)
ggplot(pvalue_dist_Age.Sex.SV, aes(Nominal_P)) + 
  geom_histogram(fill = "grey90", color = "black") + 
  theme_classic() + xlab("Nominal P Value") + 
  ylim(0, 20000) + 
  xlim(min(Age.Sex.SV_LM_pval), max(Age.Sex.SV_LM_pval))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).

#Not right-skewed. Distribution is a little even. 

#Multiple test correction with FDR.
M_values.funnorm.filt <- as.data.frame(M_values.funnorm.filt)
Multi_test_corr_relaxed <- p.adjust(Age.Sex.SV_LM_pval, method = "fdr", n = length(Age.Sex.SV_LM_pval))

#Looking at FDR thresholds for hits:
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.05),]) #20 at 0.05.
## [1] 20 44
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.1),]) #49 at 0.1.
## [1] 49 44
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.2),]) #143 at 0.2.
## [1] 143  44
#Looking at top hits by nominal P:
pvalue_dist_Age.Sex.SV <- pvalue_dist_Age.Sex.SV[order(pvalue_dist_Age.Sex.SV$Nominal_P),]
head(pvalue_dist_Age.Sex.SV)
##               CpG    Nominal_P
## 100675 cg25359907 9.561427e-08
## 323108 cg19005438 3.161929e-07
## 395001 cg04253011 3.515988e-07
## 245266 cg15861585 6.388047e-07
## 51048  cg06528150 7.848523e-07
## 75729  cg15155209 8.500193e-07
#Load 450K annotation data.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
head(EPIC_Annotation_Complete)
##                  Name AddressA_ID
## cg07881041 cg07881041  0085713262
## cg18478105 cg18478105  0046761277
## cg23229610 cg23229610  0021717843
## cg03513874 cg03513874  0029622133
## cg09835024 cg09835024  0016745152
## cg05451842 cg05451842  0016681196
##                                              AlleleA_ProbeSeq AddressB_ID
## cg07881041 CTACAAATACAACACCCTCAACCCATATTTCATATATTATCTCATTTAAC            
## cg18478105 AAATAAATTTCACTCTCAAATCCCAATCTCATACAACAAAACAAAAACCA  0086644198
## cg23229610 ATAAAATTCTTTCCTTAAAAAACATTAAAACCAAAATAAACAAAAATTCC            
## cg03513874 ACAATAAAATAATAAAATCCCATCACTACTTACCCTCCTTAAATAATATC            
## cg09835024 AATAAACACCAACCCCAAACCAATCTCACTTTATTAAATTACAAAAATCA  0081631976
## cg05451842 CRTTCAAATACACTATAACCCRACTAAAAAAACCCCCAACAACCCAAAAC            
##                                              AlleleB_ProbeSeq
## cg07881041                                                   
## cg18478105 AAATAAATTTCGCTCTCAAATCCCAATCTCGTACGACGAAACGAAAACCG
## cg23229610                                                   
## cg03513874                                                   
## cg09835024 AATAAACGCCGACCCCGAACCGATCTCGCTTTATTAAATTACAAAAATCG
## cg05451842                                                   
##            Infinium_Design_Type Next_Base Color_Channel
## cg07881041                   II                        
## cg18478105                    I         C           Grn
## cg23229610                   II                        
## cg03513874                   II                        
## cg09835024                    I         A           Red
## cg05451842                   II                        
##                                                                                                                        Forward_Sequence
## cg07881041 CTGCACGCCTACTGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAA[CG]CATGTATCATCTCATTTAATGCATGCATTATCTCATTTAATTCTCACAACCCCTCAGGTG
## cg18478105 TCCCGTCTTACGGGATGGATTTCGCTCTCAGGTCCCAGTCTCGTGCGGCGGGGCGGGGAC[CG]CAGCCGGCTGGGCGGGGAAGCCCTGAGCCGGGGAAGTCACGTGGGGCGTGTCCGGAGGCG
## cg23229610 GTTTCTGGACAGTAAAATTCTTTCCTTGAAGGACATTAGGGCCAAAATGGGCAAGGATTC[CG]AGATTGGTACATCGAGCGTTATCTTCCAACTCTCTTTTCTAAATGGGCTCATTTAGTAAT
## cg03513874 ATTGTGCCCACCTTGCTGCTGACAGTTAAGCATCACTAAAGTAGGAAATAGGGTCCAAAC[CG]ACACTACTTAAGGAGGGCAAGTAGTGATGGGACCTCATCATCCCATTGCTATCATGGAGC
## cg09835024 AGCCCCGTCATAGGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAAT[CG]GTATTCAAAAAAAAAAAAAAAAAAGGGCGGGGAGGACACTCCCTCTTCTCTGTTCCCACA
## cg05451842 CACAGCGTGGATGCCCCGATTTCCCAGGTCCCTCCGCAACCCTCAGTAGAACTCCCACCG[CG]CCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAACGCCCCGGTCCGT
##            Genome_Build CHR   MAPINFO
## cg07881041           37  19   5236016
## cg18478105           37  20  61847650
## cg23229610           37   1   6841125
## cg03513874           37   2 198303466
## cg09835024           37   X  24072640
## cg05451842           37  14  93581139
##                                                     SourceSeq Strand
## cg07881041 TGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAACG      R
## cg18478105 CGGTCCCCGCCCCGCCGCACGAGACTGGGACCTGAGAGCGAAATCCATCC      R
## cg23229610 CGGAATCCTTGCCCATTTTGGCCCTAATGTCCTTCAAGGAAAGAATTTTA      R
## cg03513874 CAATGGGATGATGAGGTCCCATCACTACTTGCCCTCCTTAAGTAGTGTCG      F
## cg09835024 GGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAATCG      R
## cg05451842 CGCCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAAC      F
##                  UCSC_RefGene_Name                  UCSC_RefGene_Accession
## cg07881041 PTPRS;PTPRS;PTPRS;PTPRS NM_130855;NM_002850;NM_130854;NM_130853
## cg18478105                  YTHDF1                               NM_017798
## cg23229610                                                                
## cg03513874                                                                
## cg09835024                  EIF2S3                               NM_001415
## cg05451842       ITPK1;ITPK1;ITPK1     NM_001142593;NM_014216;NM_001142594
##             UCSC_RefGene_Group    UCSC_CpG_Islands_Name
## cg07881041 Body;Body;Body;Body    chr19:5237294-5237669
## cg18478105              TSS200  chr20:61846843-61848103
## cg23229610                         chr1:6844313-6846366
## cg03513874                     chr2:198299244-198299972
## cg09835024             TSS1500   chrX:24072558-24073135
## cg05451842      Body;Body;Body  chr14:93581083-93582797
##            Relation_to_UCSC_CpG_Island Phantom4_Enhancers Phantom5_Enhancers
## cg07881041                     N_Shore                                      
## cg18478105                      Island                                      
## cg23229610                     N_Shelf                                      
## cg03513874                     S_Shelf                                      
## cg09835024                      Island                                      
## cg05451842                      Island                                      
##            DMR X450k_Enhancer           HMM_Island Regulatory_Feature_Name
## cg07881041                 NA                                             
## cg18478105                 NA 20:61317142-61318498    20:61846284-61847956
## cg23229610                 NA                                             
## cg03513874                 NA                                             
## cg09835024                 NA                          X:24071907-24073667
## cg05451842                 NA 14:92650663-92652544                        
##            Regulatory_Feature_Group GencodeBasicV12_NAME
## cg07881041                                              
## cg18478105      Promoter_Associated        YTHDF1;YTHDF1
## cg23229610                                              
## cg03513874                                              
## cg09835024      Promoter_Associated               EIF2S3
## cg05451842                                         ITPK1
##                      GencodeBasicV12_Accession GencodeBasicV12_Group
## cg07881041                                                          
## cg18478105 ENST00000370334.4;ENST00000370339.3         TSS200;TSS200
## cg23229610                                                          
## cg03513874                                                          
## cg09835024                   ENST00000253039.4                TSS200
## cg05451842                   ENST00000555495.1                 5'UTR
##             GencodeCompV12_NAME
## cg07881041                     
## cg18478105        YTHDF1;YTHDF1
## cg23229610                     
## cg03513874                     
## cg09835024 EIF2S3;EIF2S3;EIF2S3
## cg05451842                ITPK1
##                                         GencodeCompV12_Accession
## cg07881041                                                      
## cg18478105                   ENST00000370334.4;ENST00000370339.3
## cg23229610                                                      
## cg03513874                                                      
## cg09835024 ENST00000487075.1;ENST00000423068.1;ENST00000253039.4
## cg05451842                                     ENST00000555495.1
##              GencodeCompV12_Group DNase_Hypersensitivity_NAME
## cg07881041                                                   
## cg18478105          TSS200;TSS200     chr20:61847520-61847755
## cg23229610                                                   
## cg03513874                                                   
## cg09835024 TSS1500;TSS1500;TSS200      chrX:24072600-24073395
## cg05451842                  5'UTR     chr14:93581080-93581375
##            DNase_Hypersensitivity_Evidence_Count OpenChromatin_NAME
## cg07881041                                    NA                   
## cg18478105                                     3                   
## cg23229610                                    NA                   
## cg03513874                                    NA                   
## cg09835024                                     3                   
## cg05451842                                     3                   
##            OpenChromatin_Evidence_Count TFBS_NAME TFBS_Evidence_Count
## cg07881041                           NA                            NA
## cg18478105                           NA                            NA
## cg23229610                           NA                            NA
## cg03513874                           NA                            NA
## cg09835024                           NA                            NA
## cg05451842                           NA                            NA
##            Methyl27_Loci Methyl450_Loci Chromosome_36 Coordinate_36
## cg07881041            NA           TRUE            19       5187016
## cg18478105            NA           TRUE            20      61318095
## cg23229610            NA           TRUE             1       6763712
## cg03513874            NA           TRUE             2     198011711
## cg09835024            NA           TRUE             X      23982561
## cg05451842            NA           TRUE            14      92650892
##                             SNP_ID SNP_DISTANCE SNP_MinorAlleleFrequency
## cg07881041             rs187313142           18                 0.000200
## cg18478105             rs549944121            5                 0.001797
## cg23229610 rs545824288;rs527255711        40;12        0.000200;0.001198
## cg03513874                                                              
## cg09835024                                                              
## cg05451842             rs550745821           22                 0.000200
##            Random_Loci  X strand Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs
## cg07881041          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg18478105          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg23229610          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg03513874          NA NA      +     <NA>        NA   <NA>      NA   <NA>
## cg09835024          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg05451842          NA NA      +     <NA>        NA   <NA>      NA   <NA>
##            SBE_maf CH_450_XY CH_450_Aut CH_EPIC Cross_Hyb
## cg07881041      NA        No         No      No        No
## cg18478105      NA        No         No      No        No
## cg23229610      NA        No         No      No        No
## cg03513874      NA        No         No      No        No
## cg09835024      NA        No         No      No        No
## cg05451842      NA        No         No      No        No
dim(hits_CpGs <- pvalue_dist_Age.Sex.SV[which(pvalue_dist_Age.Sex.SV$Nominal_P < 1e-6),]) #8 hits.
## [1] 8 2
hits <- EPIC_Annotation_Complete[which(EPIC_Annotation_Complete$Name%in%hits_CpGs$CpG),]
hits$UCSC_RefGene_Name
## [1] C2CD4D;LOC100132111        DRD4                      
## [3] LOC84740                                             
## [5] KIAA1841;KIAA1841;KIAA1841 FOXB1                     
## [7] PLEKHG2                    SLC6A6;SLC6A6             
## 66070 Levels:  A1BG A1BG-AS1;A1BG A1BG-AS1;A1BG;ZNF497;ZNF497 ... ZZZ3;ZZZ3;ZZZ3
#Delta beta.
#Using Maggie's code for deltabeta:
deltabeta <- function(df, mainvar, covar1 = NULL, covar2 = NULL, covar3 = NULL, covar4 = NULL, covar5 = NULL) {
  # Calculating delta beta of the main variable of interest (mainvar), with up to 5 possible covariates (covar)
  # mainvar should be a vector of continuous variable
  # all covars should also be vectors
  # df = dataframe or matrix of beta values
  # output is a vector of delta beta values
  sd=sd(mainvar)
  qt <-
    range <- max(mainvar, na.rm = T) - min(mainvar, na.rm = T)
  dB <- vector(mode = "numeric", length = nrow(df))
  names(dB) <- rownames(df)
  for (i in 1:nrow(df)) {
    beta <- df[i, ]
    if (is.null(covar1)) {
      mod <- lm(beta ~ mainvar)
    } else if (is.null(covar2)) {
      mod <- lm(beta ~ mainvar + covar1)
    } else if (is.null(covar3)) {
      mod <- lm(beta ~ mainvar + covar1 + covar2)
    } else if (is.null(covar4)) {
      mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3)
    } else if (is.null(covar5)) {
      mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4)
    } else {
      mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4 + covar5)
    }
    slope <- mod$coefficients[2]
    dB[i] <- as.numeric(slope*range)
  }
  dB
}

betas.funnorm.filt <- m2beta(M_values.funnorm.filt)

delta_beta_Age.Sex.SV_fixed <- deltabeta(as.matrix(betas.funnorm.filt), AMD_pData$Age, covar1 = AMD_pData$Sex, covar2 = AMD_pData$SV)
length(delta_beta_Age.Sex.SV_fixed)
summary(delta_beta_Age.Sex.SV_fixed)

save(delta_beta_Age.Sex.SV_fixed, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/DB_Age.Sex.SV_fixed.RData")
##3. Linear Model: Volcano Plot
#Volcano to examine hits (for DB, see below chunks):
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/DB_Age.Sex.SV_fixed.RData")

#Call Volcano (Nominal p Version, modified from Rachel's code):
source("/home/BCRICWH.LAN/dlin/KoborLab/kobor_space/cake/home/dlin/Volcano_DL_Nominal.R")

#After running the last 2 chunks, make a summary table with CpG, Nominal_P, FDR, and Delta Beta.
Age.Sex.SV_Table <- data.frame(rownames(M_values.funnorm.filt), Age.Sex.SV_LM_pval, Multi_test_corr_relaxed, delta_beta_Age.Sex.SV_fixed)
colnames(Age.Sex.SV_Table) = c("CpG", "Nominal_P", "FDR", "Delta_Beta")
identical(as.character(rownames(Age.Sex.SV_Table)), as.character(Age.Sex.SV_Table$CpG)) #TRUE.
## [1] TRUE
#Looking at top hits quickly without considering DB:
head(Age.Sex.SV_Table[order(Age.Sex.SV_Table$Nominal_P),],10)
##                   CpG    Nominal_P        FDR  Delta_Beta
## cg25359907 cg25359907 9.561427e-08 0.04067967  0.05135459
## cg19005438 cg19005438 3.161929e-07 0.04814805  0.15230396
## cg04253011 cg04253011 3.515988e-07 0.04814805 -0.12068960
## cg15861585 cg15861585 6.388047e-07 0.04814805  0.25150827
## cg06528150 cg06528150 7.848523e-07 0.04814805  0.04162110
## cg15155209 cg15155209 8.500193e-07 0.04814805 -0.18413729
## cg15015892 cg15015892 8.837434e-07 0.04814805  0.29120319
## cg02727104 cg02727104 9.491837e-07 0.04814805  0.13523762
## cg22197050 cg22197050 1.231075e-06 0.04814805  0.06811023
## cg24783211 cg24783211 1.309124e-06 0.04814805  0.22864438
##Setting a threshold of 0.05DB, 5e-6 Nominal P (scale to 0.60DB):
makeVolcano_nominal(Age.Sex.SV_Table$Nominal_P, Age.Sex.SV_Table$Delta_Beta, 0.05, 5e-6, "DNAm changes", 0.5) #at 5e-6: 9 Hypermethylated, 0 Hypomethylated
## [1] "Hypermethylated: 23"
## [1] "Hypomethylated: 5"
## Warning: Removed 2 rows containing missing values (geom_point).

#What are these hits?
#First make an annotated table - load 450K manifest.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
head(EPIC_Annotation_Complete)
##                  Name AddressA_ID
## cg07881041 cg07881041  0085713262
## cg18478105 cg18478105  0046761277
## cg23229610 cg23229610  0021717843
## cg03513874 cg03513874  0029622133
## cg09835024 cg09835024  0016745152
## cg05451842 cg05451842  0016681196
##                                              AlleleA_ProbeSeq AddressB_ID
## cg07881041 CTACAAATACAACACCCTCAACCCATATTTCATATATTATCTCATTTAAC            
## cg18478105 AAATAAATTTCACTCTCAAATCCCAATCTCATACAACAAAACAAAAACCA  0086644198
## cg23229610 ATAAAATTCTTTCCTTAAAAAACATTAAAACCAAAATAAACAAAAATTCC            
## cg03513874 ACAATAAAATAATAAAATCCCATCACTACTTACCCTCCTTAAATAATATC            
## cg09835024 AATAAACACCAACCCCAAACCAATCTCACTTTATTAAATTACAAAAATCA  0081631976
## cg05451842 CRTTCAAATACACTATAACCCRACTAAAAAAACCCCCAACAACCCAAAAC            
##                                              AlleleB_ProbeSeq
## cg07881041                                                   
## cg18478105 AAATAAATTTCGCTCTCAAATCCCAATCTCGTACGACGAAACGAAAACCG
## cg23229610                                                   
## cg03513874                                                   
## cg09835024 AATAAACGCCGACCCCGAACCGATCTCGCTTTATTAAATTACAAAAATCG
## cg05451842                                                   
##            Infinium_Design_Type Next_Base Color_Channel
## cg07881041                   II                        
## cg18478105                    I         C           Grn
## cg23229610                   II                        
## cg03513874                   II                        
## cg09835024                    I         A           Red
## cg05451842                   II                        
##                                                                                                                        Forward_Sequence
## cg07881041 CTGCACGCCTACTGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAA[CG]CATGTATCATCTCATTTAATGCATGCATTATCTCATTTAATTCTCACAACCCCTCAGGTG
## cg18478105 TCCCGTCTTACGGGATGGATTTCGCTCTCAGGTCCCAGTCTCGTGCGGCGGGGCGGGGAC[CG]CAGCCGGCTGGGCGGGGAAGCCCTGAGCCGGGGAAGTCACGTGGGGCGTGTCCGGAGGCG
## cg23229610 GTTTCTGGACAGTAAAATTCTTTCCTTGAAGGACATTAGGGCCAAAATGGGCAAGGATTC[CG]AGATTGGTACATCGAGCGTTATCTTCCAACTCTCTTTTCTAAATGGGCTCATTTAGTAAT
## cg03513874 ATTGTGCCCACCTTGCTGCTGACAGTTAAGCATCACTAAAGTAGGAAATAGGGTCCAAAC[CG]ACACTACTTAAGGAGGGCAAGTAGTGATGGGACCTCATCATCCCATTGCTATCATGGAGC
## cg09835024 AGCCCCGTCATAGGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAAT[CG]GTATTCAAAAAAAAAAAAAAAAAAGGGCGGGGAGGACACTCCCTCTTCTCTGTTCCCACA
## cg05451842 CACAGCGTGGATGCCCCGATTTCCCAGGTCCCTCCGCAACCCTCAGTAGAACTCCCACCG[CG]CCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAACGCCCCGGTCCGT
##            Genome_Build CHR   MAPINFO
## cg07881041           37  19   5236016
## cg18478105           37  20  61847650
## cg23229610           37   1   6841125
## cg03513874           37   2 198303466
## cg09835024           37   X  24072640
## cg05451842           37  14  93581139
##                                                     SourceSeq Strand
## cg07881041 TGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAACG      R
## cg18478105 CGGTCCCCGCCCCGCCGCACGAGACTGGGACCTGAGAGCGAAATCCATCC      R
## cg23229610 CGGAATCCTTGCCCATTTTGGCCCTAATGTCCTTCAAGGAAAGAATTTTA      R
## cg03513874 CAATGGGATGATGAGGTCCCATCACTACTTGCCCTCCTTAAGTAGTGTCG      F
## cg09835024 GGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAATCG      R
## cg05451842 CGCCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAAC      F
##                  UCSC_RefGene_Name                  UCSC_RefGene_Accession
## cg07881041 PTPRS;PTPRS;PTPRS;PTPRS NM_130855;NM_002850;NM_130854;NM_130853
## cg18478105                  YTHDF1                               NM_017798
## cg23229610                                                                
## cg03513874                                                                
## cg09835024                  EIF2S3                               NM_001415
## cg05451842       ITPK1;ITPK1;ITPK1     NM_001142593;NM_014216;NM_001142594
##             UCSC_RefGene_Group    UCSC_CpG_Islands_Name
## cg07881041 Body;Body;Body;Body    chr19:5237294-5237669
## cg18478105              TSS200  chr20:61846843-61848103
## cg23229610                         chr1:6844313-6846366
## cg03513874                     chr2:198299244-198299972
## cg09835024             TSS1500   chrX:24072558-24073135
## cg05451842      Body;Body;Body  chr14:93581083-93582797
##            Relation_to_UCSC_CpG_Island Phantom4_Enhancers Phantom5_Enhancers
## cg07881041                     N_Shore                                      
## cg18478105                      Island                                      
## cg23229610                     N_Shelf                                      
## cg03513874                     S_Shelf                                      
## cg09835024                      Island                                      
## cg05451842                      Island                                      
##            DMR X450k_Enhancer           HMM_Island Regulatory_Feature_Name
## cg07881041                 NA                                             
## cg18478105                 NA 20:61317142-61318498    20:61846284-61847956
## cg23229610                 NA                                             
## cg03513874                 NA                                             
## cg09835024                 NA                          X:24071907-24073667
## cg05451842                 NA 14:92650663-92652544                        
##            Regulatory_Feature_Group GencodeBasicV12_NAME
## cg07881041                                              
## cg18478105      Promoter_Associated        YTHDF1;YTHDF1
## cg23229610                                              
## cg03513874                                              
## cg09835024      Promoter_Associated               EIF2S3
## cg05451842                                         ITPK1
##                      GencodeBasicV12_Accession GencodeBasicV12_Group
## cg07881041                                                          
## cg18478105 ENST00000370334.4;ENST00000370339.3         TSS200;TSS200
## cg23229610                                                          
## cg03513874                                                          
## cg09835024                   ENST00000253039.4                TSS200
## cg05451842                   ENST00000555495.1                 5'UTR
##             GencodeCompV12_NAME
## cg07881041                     
## cg18478105        YTHDF1;YTHDF1
## cg23229610                     
## cg03513874                     
## cg09835024 EIF2S3;EIF2S3;EIF2S3
## cg05451842                ITPK1
##                                         GencodeCompV12_Accession
## cg07881041                                                      
## cg18478105                   ENST00000370334.4;ENST00000370339.3
## cg23229610                                                      
## cg03513874                                                      
## cg09835024 ENST00000487075.1;ENST00000423068.1;ENST00000253039.4
## cg05451842                                     ENST00000555495.1
##              GencodeCompV12_Group DNase_Hypersensitivity_NAME
## cg07881041                                                   
## cg18478105          TSS200;TSS200     chr20:61847520-61847755
## cg23229610                                                   
## cg03513874                                                   
## cg09835024 TSS1500;TSS1500;TSS200      chrX:24072600-24073395
## cg05451842                  5'UTR     chr14:93581080-93581375
##            DNase_Hypersensitivity_Evidence_Count OpenChromatin_NAME
## cg07881041                                    NA                   
## cg18478105                                     3                   
## cg23229610                                    NA                   
## cg03513874                                    NA                   
## cg09835024                                     3                   
## cg05451842                                     3                   
##            OpenChromatin_Evidence_Count TFBS_NAME TFBS_Evidence_Count
## cg07881041                           NA                            NA
## cg18478105                           NA                            NA
## cg23229610                           NA                            NA
## cg03513874                           NA                            NA
## cg09835024                           NA                            NA
## cg05451842                           NA                            NA
##            Methyl27_Loci Methyl450_Loci Chromosome_36 Coordinate_36
## cg07881041            NA           TRUE            19       5187016
## cg18478105            NA           TRUE            20      61318095
## cg23229610            NA           TRUE             1       6763712
## cg03513874            NA           TRUE             2     198011711
## cg09835024            NA           TRUE             X      23982561
## cg05451842            NA           TRUE            14      92650892
##                             SNP_ID SNP_DISTANCE SNP_MinorAlleleFrequency
## cg07881041             rs187313142           18                 0.000200
## cg18478105             rs549944121            5                 0.001797
## cg23229610 rs545824288;rs527255711        40;12        0.000200;0.001198
## cg03513874                                                              
## cg09835024                                                              
## cg05451842             rs550745821           22                 0.000200
##            Random_Loci  X strand Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs
## cg07881041          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg18478105          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg23229610          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg03513874          NA NA      +     <NA>        NA   <NA>      NA   <NA>
## cg09835024          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg05451842          NA NA      +     <NA>        NA   <NA>      NA   <NA>
##            SBE_maf CH_450_XY CH_450_Aut CH_EPIC Cross_Hyb
## cg07881041      NA        No         No      No        No
## cg18478105      NA        No         No      No        No
## cg23229610      NA        No         No      No        No
## cg03513874      NA        No         No      No        No
## cg09835024      NA        No         No      No        No
## cg05451842      NA        No         No      No        No
Age.Sex.SV_Table.annotated = merge(Age.Sex.SV_Table, EPIC_Annotation_Complete[,c("Name", "CHR", "Strand", "UCSC_RefGene_Name", "UCSC_RefGene_Group")], by.x = "CpG", by.y = "Name", all = FALSE)
colnames(Age.Sex.SV_Table.annotated)[5:6] = c("Chromosome", "Coordinate")
Age.Sex.SV_Table.annotated <- Age.Sex.SV_Table.annotated[order(Age.Sex.SV_Table.annotated$Nominal_P),]

#Grabbing the Volcano hits:
LM_Age.Sex.SV_Hits <- Age.Sex.SV_Table.annotated[which(abs(Age.Sex.SV_Table.annotated$Delta_Beta)>0.05 & Age.Sex.SV_Table.annotated$Nominal_P<5e-6),]

#Let's order by Nominal_P:
LM_Age.Sex.SV_Hits = LM_Age.Sex.SV_Hits[order(LM_Age.Sex.SV_Hits$Nominal_P),]
rownames(LM_Age.Sex.SV_Hits) = c()
str(LM_Age.Sex.SV_Hits)
## 'data.frame':    26 obs. of  8 variables:
##  $ CpG               : Factor w/ 425456 levels "cg00000029","cg00000108",..: 389441 301975 74991 256738 246988 244895 48814 345056 381395 88163 ...
##  $ Nominal_P         : num  9.56e-08 3.16e-07 3.52e-07 6.39e-07 8.50e-07 ...
##  $ FDR               : num  0.0407 0.0481 0.0481 0.0481 0.0481 ...
##  $ Delta_Beta        : num  0.0514 0.1523 -0.1207 0.2515 -0.1841 ...
##  $ Chromosome        : Factor w/ 25 levels "","1","10","11",..: 18 8 12 4 17 2 6 13 18 5 ...
##  $ Coordinate        : Factor w/ 3 levels "","F","R": 3 2 2 2 2 3 2 3 3 2 ...
##  $ UCSC_RefGene_Name : Factor w/ 66070 levels "","A1BG","A1BG-AS1;A1BG",..: 32758 20278 44629 15936 53494 7376 1 30522 1 51815 ...
##  $ UCSC_RefGene_Group: Factor w/ 8044 levels "","1stExon","1stExon;1stExon",..: 5441 5441 3719 5441 4133 5126 1 5441 1 3719 ...
head(LM_Age.Sex.SV_Hits)
##          CpG    Nominal_P        FDR  Delta_Beta Chromosome Coordinate
## 1 cg25359907 9.561427e-08 0.04067967  0.05135459          4          R
## 2 cg19005438 3.161929e-07 0.04814805  0.15230396         15          F
## 3 cg04253011 3.515988e-07 0.04814805 -0.12068960         19          F
## 4 cg15861585 6.388047e-07 0.04814805  0.25150827         11          F
## 5 cg15155209 8.500193e-07 0.04814805 -0.18413729          3          F
## 6 cg15015892 8.837434e-07 0.04814805  0.29120319          1          R
##     UCSC_RefGene_Name UCSC_RefGene_Group
## 1            LOC84740            TSS1500
## 2               FOXB1            TSS1500
## 3             PLEKHG2               Body
## 4                DRD4            TSS1500
## 5       SLC6A6;SLC6A6          Body;Body
## 6 C2CD4D;LOC100132111        Body;TSS200

Age-associated hits: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3482848/ Non-tissue specific hits: https://epigeneticsandchromatin.biomedcentral.com/articles/10.1186/s13072-018-0191-3

#Reorder row index.
rownames(Age.Sex.SV_Table.annotated) <- NULL

#Look at hits based on candidate genes.
head(Age.Sex.SV_Table.annotated[which(Age.Sex.SV_Table.annotated$UCSC_RefGene_Name == "ELOVL2"),]) #First ELOVL2 hit == row 278.
##               CpG    Nominal_P       FDR  Delta_Beta Chromosome Coordinate
## 82     cg16867657 4.050478e-05 0.1909535  0.13208991          6          F
## 884    cg24724428 7.100348e-04 0.3174740  0.13746296          6          F
## 4921   cg21572722 5.900411e-03 0.4754659  0.06612837          6          F
## 9021   cg01799681 1.236828e-02 0.5416599 -0.07939168          6          F
## 104408 cg16323298 2.252035e-01 0.8520043 -0.02284845          6          F
## 110163 cg13562911 2.392363e-01 0.8579563  0.01397406          6          R
##        UCSC_RefGene_Name UCSC_RefGene_Group
## 82                ELOVL2            TSS1500
## 884               ELOVL2            TSS1500
## 4921              ELOVL2            TSS1500
## 9021              ELOVL2               Body
## 104408            ELOVL2            TSS1500
## 110163            ELOVL2               Body
head(Age.Sex.SV_Table.annotated[which(Age.Sex.SV_Table.annotated$UCSC_RefGene_Name == "EDARADD"),]) #First EDARADD hit == row 111292.
##              CpG Nominal_P       FDR Delta_Beta Chromosome Coordinate
## 70116 cg18964582 0.1429912 0.8056483 0.01454488          1          R
##       UCSC_RefGene_Name UCSC_RefGene_Group
## 70116           EDARADD            TSS1500
head(Age.Sex.SV_Table.annotated[which(Age.Sex.SV_Table.annotated$UCSC_RefGene_Name == "TOM1L1"),]) #First TOM1L1 hit == row 2140.
##               CpG   Nominal_P       FDR Delta_Beta Chromosome Coordinate
## 1195   cg25431220 0.001022328 0.3389639 0.02343953         17          F
## 5163   cg07081054 0.006257355 0.4796366 0.05648637         17          R
## 41557  cg03870845 0.077631573 0.7376456 0.01682909         17          F
## 60672  cg05265484 0.120857479 0.7869535 0.05997839         17          F
## 66315  cg10237252 0.133852023 0.7974528 0.09643585         17          R
## 148404 cg12240603 0.335573177 0.8936094 0.01685166         17          F
##        UCSC_RefGene_Name UCSC_RefGene_Group
## 1195              TOM1L1             TSS200
## 5163              TOM1L1               Body
## 41557             TOM1L1             TSS200
## 60672             TOM1L1            TSS1500
## 66315             TOM1L1            TSS1500
## 148404            TOM1L1            TSS1500
head(Age.Sex.SV_Table.annotated[which(Age.Sex.SV_Table.annotated$UCSC_RefGene_Name == "NPTX2"),]) #First NPTX2 hit == row 8828.
##              CpG   Nominal_P       FDR Delta_Beta Chromosome Coordinate
## 6561  cg02368096 0.008382397 0.5050415 0.06860593          7          R
## 21042 cg13878520 0.034448045 0.6466652 0.03481473          7          R
## 25935 cg13314145 0.044192157 0.6732259 0.02981350          7          R
## 35336 cg08315202 0.063950142 0.7147905 0.04385453          7          R
## 40619 cg13585675 0.075444661 0.7334426 0.04452483          7          F
## 79584 cg13695954 0.165389642 0.8210634 0.01155458          7          F
##       UCSC_RefGene_Name UCSC_RefGene_Group
## 6561              NPTX2               Body
## 21042             NPTX2               Body
## 25935             NPTX2            TSS1500
## 35336             NPTX2            TSS1500
## 40619             NPTX2               Body
## 79584             NPTX2               Body

EWAS Model 5: All samples on Age + Sex + SV + Chip.

Sex, SV (cell type) and chip as covariates.

#Load all the objects.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_SVA_M_values.funnorm.RData") 

dim(AMD_pData) #44 samples.
## [1] 44 22
dim(M_values.funnorm.filt) #425456 probes.
## [1] 425456     44
AMD_pData$Disease_State <- as.factor(AMD_pData$Disease_State)
AMD_pData$Sex <- as.factor(AMD_pData$Sex)
AMD_pData$Row <- as.factor(AMD_pData$Row)
AMD_pData$Chip <- as.factor(AMD_pData$Chip)
AMD_pData$Age <- as.numeric(AMD_pData$Age)

str(AMD_pData)
## 'data.frame':    44 obs. of  22 variables:
##  $ Sample_Name  : chr  "Sample 1" "Sample 10" "Sample 11" "Sample 12" ...
##  $ Disease_State: Factor w/ 2 levels "age-related macular degeneration",..: 2 2 2 1 1 1 2 1 1 1 ...
##  $ Sex          : Factor w/ 2 levels "F","M": 2 2 2 2 2 1 2 2 1 2 ...
##  $ Age          : num  61 74 70 76 79 89 66 70 83 76 ...
##  $ Tissue       : chr  "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" ...
##  $ Row          : Factor w/ 12 levels "R01C01","R01C02",..: 3 10 12 1 3 5 7 9 11 2 ...
##  $ Chip         : Factor w/ 4 levels "200723300084",..: 2 2 2 4 4 4 4 4 4 4 ...
##  $ Basename     : chr  "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
##  $ filenames    : chr  "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
##  $ xMed         : num  11.4 11.7 11.9 11.7 11.6 ...
##  $ yMed         : num  11.7 12 12.1 11.9 11.9 ...
##  $ predictedSex : chr  "M" "M" "M" "M" ...
##  $ SV           : num  -0.1222 -0.011 -0.1074 -0.073 -0.0265 ...
##  $ Epithelial   : num  0.387 0.436 0.367 0.413 0.387 ...
##  $ Fibroblast   : num  0.326 0.316 0.298 0.24 0.312 ...
##  $ B_Cell       : num  0.0502 0.0454 0.0582 0.0528 0.0482 ...
##  $ NK_Cell      : num  0.065 0.0601 0.0816 0.0986 0.0743 ...
##  $ CD4T         : num  0.0478 0.0467 0.0545 0.0658 0.0544 ...
##  $ CD8T         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Monocyte     : num  0.0685 0.05 0.081 0.0666 0.0651 ...
##  $ Neutrophil   : num  0 0 0 0 0 ...
##  $ Eosinophil   : num  0.0555 0.0457 0.0597 0.0632 0.0597 ...
#Check order.
identical(rownames(AMD_pData), colnames(M_values.funnorm.filt)) #TRUE. 
## [1] TRUE
#Sanity check - there should be no NAs or infinite numbers - which could be a result of logit transformation of 0 or 1 beta values. 
all(complete.cases(M_values.funnorm.filt)) == "TRUE" #TRUE - meaning no NA or infinite numbers. 
## [1] TRUE
library(pbapply) #Progress bar for apply functions.

#EWAS on Age - All samples + Sex + SV + Chip.
#LM: Need to use transformed M-values instead of beta values as it is more statistically sound.
Age.Sex.SV.chip_LM_pval <- pbsapply(1:nrow(M_values.funnorm.filt), function(CpG){
  meta <- AMD_pData
  meta$Mval <- M_values.funnorm.filt[CpG,]
  mod_Age.Sex.SV.chip <- lm(Mval ~ Age + Sex + SV + Chip, data = meta) #Only Sex + SV + Chip as covariate.
  coef(summary(mod_Age.Sex.SV.chip))[2,4]}) #Returns nominal p-value for Age for model at each CpG.
head(Age.Sex.SV.chip_LM_pval)

save(Age.Sex.SV.chip_LM_pval, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Age.Sex.SV.chip_LM_pval.RData")
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Age.Sex.SV.chip_LM_pval.RData")

#Inspect p-value distribution for model.
pvalue_dist_Age.Sex.SV.chip <- data.frame(CpG = rownames(M_values.funnorm.filt), Nominal_P = Age.Sex.SV.chip_LM_pval)
ggplot(pvalue_dist_Age.Sex.SV.chip, aes(Nominal_P)) + 
  geom_histogram(fill = "grey90", color = "black") + 
  theme_classic() + xlab("Nominal P Value") + 
  ylim(0, 20000) + 
  xlim(min(Age.Sex.SV.chip_LM_pval), max(Age.Sex.SV.chip_LM_pval))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).

#Not right-skewed. Distribution is a little even. 

#Multiple test correction with FDR.
M_values.funnorm.filt <- as.data.frame(M_values.funnorm.filt)
Multi_test_corr_relaxed <- p.adjust(Age.Sex.SV.chip_LM_pval, method = "fdr", n = length(Age.Sex.SV.chip_LM_pval))

#Looking at FDR thresholds for hits:
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.05),]) #20 at 0.05.
## [1]  0 44
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.1),]) #49 at 0.1.
## [1]  0 44
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.2),]) #143 at 0.2.
## [1] 24 44
#Looking at top hits by nominal P:
pvalue_dist_Age.Sex.SV.chip <- pvalue_dist_Age.Sex.SV.chip[order(pvalue_dist_Age.Sex.SV.chip$Nominal_P),]
head(pvalue_dist_Age.Sex.SV.chip)
##               CpG    Nominal_P
## 75729  cg15155209 5.422114e-07
## 201801 cg21132564 7.437805e-07
## 27468  cg24155129 8.366901e-07
## 296800 cg27375012 9.669248e-07
## 100675 cg25359907 1.527042e-06
## 296713 cg06968164 2.590083e-06
#Load 450K annotation data.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
head(EPIC_Annotation_Complete)
##                  Name AddressA_ID
## cg07881041 cg07881041  0085713262
## cg18478105 cg18478105  0046761277
## cg23229610 cg23229610  0021717843
## cg03513874 cg03513874  0029622133
## cg09835024 cg09835024  0016745152
## cg05451842 cg05451842  0016681196
##                                              AlleleA_ProbeSeq AddressB_ID
## cg07881041 CTACAAATACAACACCCTCAACCCATATTTCATATATTATCTCATTTAAC            
## cg18478105 AAATAAATTTCACTCTCAAATCCCAATCTCATACAACAAAACAAAAACCA  0086644198
## cg23229610 ATAAAATTCTTTCCTTAAAAAACATTAAAACCAAAATAAACAAAAATTCC            
## cg03513874 ACAATAAAATAATAAAATCCCATCACTACTTACCCTCCTTAAATAATATC            
## cg09835024 AATAAACACCAACCCCAAACCAATCTCACTTTATTAAATTACAAAAATCA  0081631976
## cg05451842 CRTTCAAATACACTATAACCCRACTAAAAAAACCCCCAACAACCCAAAAC            
##                                              AlleleB_ProbeSeq
## cg07881041                                                   
## cg18478105 AAATAAATTTCGCTCTCAAATCCCAATCTCGTACGACGAAACGAAAACCG
## cg23229610                                                   
## cg03513874                                                   
## cg09835024 AATAAACGCCGACCCCGAACCGATCTCGCTTTATTAAATTACAAAAATCG
## cg05451842                                                   
##            Infinium_Design_Type Next_Base Color_Channel
## cg07881041                   II                        
## cg18478105                    I         C           Grn
## cg23229610                   II                        
## cg03513874                   II                        
## cg09835024                    I         A           Red
## cg05451842                   II                        
##                                                                                                                        Forward_Sequence
## cg07881041 CTGCACGCCTACTGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAA[CG]CATGTATCATCTCATTTAATGCATGCATTATCTCATTTAATTCTCACAACCCCTCAGGTG
## cg18478105 TCCCGTCTTACGGGATGGATTTCGCTCTCAGGTCCCAGTCTCGTGCGGCGGGGCGGGGAC[CG]CAGCCGGCTGGGCGGGGAAGCCCTGAGCCGGGGAAGTCACGTGGGGCGTGTCCGGAGGCG
## cg23229610 GTTTCTGGACAGTAAAATTCTTTCCTTGAAGGACATTAGGGCCAAAATGGGCAAGGATTC[CG]AGATTGGTACATCGAGCGTTATCTTCCAACTCTCTTTTCTAAATGGGCTCATTTAGTAAT
## cg03513874 ATTGTGCCCACCTTGCTGCTGACAGTTAAGCATCACTAAAGTAGGAAATAGGGTCCAAAC[CG]ACACTACTTAAGGAGGGCAAGTAGTGATGGGACCTCATCATCCCATTGCTATCATGGAGC
## cg09835024 AGCCCCGTCATAGGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAAT[CG]GTATTCAAAAAAAAAAAAAAAAAAGGGCGGGGAGGACACTCCCTCTTCTCTGTTCCCACA
## cg05451842 CACAGCGTGGATGCCCCGATTTCCCAGGTCCCTCCGCAACCCTCAGTAGAACTCCCACCG[CG]CCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAACGCCCCGGTCCGT
##            Genome_Build CHR   MAPINFO
## cg07881041           37  19   5236016
## cg18478105           37  20  61847650
## cg23229610           37   1   6841125
## cg03513874           37   2 198303466
## cg09835024           37   X  24072640
## cg05451842           37  14  93581139
##                                                     SourceSeq Strand
## cg07881041 TGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAACG      R
## cg18478105 CGGTCCCCGCCCCGCCGCACGAGACTGGGACCTGAGAGCGAAATCCATCC      R
## cg23229610 CGGAATCCTTGCCCATTTTGGCCCTAATGTCCTTCAAGGAAAGAATTTTA      R
## cg03513874 CAATGGGATGATGAGGTCCCATCACTACTTGCCCTCCTTAAGTAGTGTCG      F
## cg09835024 GGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAATCG      R
## cg05451842 CGCCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAAC      F
##                  UCSC_RefGene_Name                  UCSC_RefGene_Accession
## cg07881041 PTPRS;PTPRS;PTPRS;PTPRS NM_130855;NM_002850;NM_130854;NM_130853
## cg18478105                  YTHDF1                               NM_017798
## cg23229610                                                                
## cg03513874                                                                
## cg09835024                  EIF2S3                               NM_001415
## cg05451842       ITPK1;ITPK1;ITPK1     NM_001142593;NM_014216;NM_001142594
##             UCSC_RefGene_Group    UCSC_CpG_Islands_Name
## cg07881041 Body;Body;Body;Body    chr19:5237294-5237669
## cg18478105              TSS200  chr20:61846843-61848103
## cg23229610                         chr1:6844313-6846366
## cg03513874                     chr2:198299244-198299972
## cg09835024             TSS1500   chrX:24072558-24073135
## cg05451842      Body;Body;Body  chr14:93581083-93582797
##            Relation_to_UCSC_CpG_Island Phantom4_Enhancers Phantom5_Enhancers
## cg07881041                     N_Shore                                      
## cg18478105                      Island                                      
## cg23229610                     N_Shelf                                      
## cg03513874                     S_Shelf                                      
## cg09835024                      Island                                      
## cg05451842                      Island                                      
##            DMR X450k_Enhancer           HMM_Island Regulatory_Feature_Name
## cg07881041                 NA                                             
## cg18478105                 NA 20:61317142-61318498    20:61846284-61847956
## cg23229610                 NA                                             
## cg03513874                 NA                                             
## cg09835024                 NA                          X:24071907-24073667
## cg05451842                 NA 14:92650663-92652544                        
##            Regulatory_Feature_Group GencodeBasicV12_NAME
## cg07881041                                              
## cg18478105      Promoter_Associated        YTHDF1;YTHDF1
## cg23229610                                              
## cg03513874                                              
## cg09835024      Promoter_Associated               EIF2S3
## cg05451842                                         ITPK1
##                      GencodeBasicV12_Accession GencodeBasicV12_Group
## cg07881041                                                          
## cg18478105 ENST00000370334.4;ENST00000370339.3         TSS200;TSS200
## cg23229610                                                          
## cg03513874                                                          
## cg09835024                   ENST00000253039.4                TSS200
## cg05451842                   ENST00000555495.1                 5'UTR
##             GencodeCompV12_NAME
## cg07881041                     
## cg18478105        YTHDF1;YTHDF1
## cg23229610                     
## cg03513874                     
## cg09835024 EIF2S3;EIF2S3;EIF2S3
## cg05451842                ITPK1
##                                         GencodeCompV12_Accession
## cg07881041                                                      
## cg18478105                   ENST00000370334.4;ENST00000370339.3
## cg23229610                                                      
## cg03513874                                                      
## cg09835024 ENST00000487075.1;ENST00000423068.1;ENST00000253039.4
## cg05451842                                     ENST00000555495.1
##              GencodeCompV12_Group DNase_Hypersensitivity_NAME
## cg07881041                                                   
## cg18478105          TSS200;TSS200     chr20:61847520-61847755
## cg23229610                                                   
## cg03513874                                                   
## cg09835024 TSS1500;TSS1500;TSS200      chrX:24072600-24073395
## cg05451842                  5'UTR     chr14:93581080-93581375
##            DNase_Hypersensitivity_Evidence_Count OpenChromatin_NAME
## cg07881041                                    NA                   
## cg18478105                                     3                   
## cg23229610                                    NA                   
## cg03513874                                    NA                   
## cg09835024                                     3                   
## cg05451842                                     3                   
##            OpenChromatin_Evidence_Count TFBS_NAME TFBS_Evidence_Count
## cg07881041                           NA                            NA
## cg18478105                           NA                            NA
## cg23229610                           NA                            NA
## cg03513874                           NA                            NA
## cg09835024                           NA                            NA
## cg05451842                           NA                            NA
##            Methyl27_Loci Methyl450_Loci Chromosome_36 Coordinate_36
## cg07881041            NA           TRUE            19       5187016
## cg18478105            NA           TRUE            20      61318095
## cg23229610            NA           TRUE             1       6763712
## cg03513874            NA           TRUE             2     198011711
## cg09835024            NA           TRUE             X      23982561
## cg05451842            NA           TRUE            14      92650892
##                             SNP_ID SNP_DISTANCE SNP_MinorAlleleFrequency
## cg07881041             rs187313142           18                 0.000200
## cg18478105             rs549944121            5                 0.001797
## cg23229610 rs545824288;rs527255711        40;12        0.000200;0.001198
## cg03513874                                                              
## cg09835024                                                              
## cg05451842             rs550745821           22                 0.000200
##            Random_Loci  X strand Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs
## cg07881041          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg18478105          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg23229610          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg03513874          NA NA      +     <NA>        NA   <NA>      NA   <NA>
## cg09835024          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg05451842          NA NA      +     <NA>        NA   <NA>      NA   <NA>
##            SBE_maf CH_450_XY CH_450_Aut CH_EPIC Cross_Hyb
## cg07881041      NA        No         No      No        No
## cg18478105      NA        No         No      No        No
## cg23229610      NA        No         No      No        No
## cg03513874      NA        No         No      No        No
## cg09835024      NA        No         No      No        No
## cg05451842      NA        No         No      No        No
dim(hits_CpGs <- pvalue_dist_Age.Sex.SV.chip[which(pvalue_dist_Age.Sex.SV.chip$Nominal_P < 1e-6),]) #8 hits.
## [1] 4 2
hits <- EPIC_Annotation_Complete[which(EPIC_Annotation_Complete$Name%in%hits_CpGs$CpG),]
hits$UCSC_RefGene_Name
## [1] SLC25A30      ANK1          SLC6A6;SLC6A6
## 66070 Levels:  A1BG A1BG-AS1;A1BG A1BG-AS1;A1BG;ZNF497;ZNF497 ... ZZZ3;ZZZ3;ZZZ3
#Delta beta.
#Using Maggie's code for deltabeta:
deltabeta <- function(df, mainvar, covar1 = NULL, covar2 = NULL, covar3 = NULL, covar4 = NULL, covar5 = NULL) {
  # Calculating delta beta of the main variable of interest (mainvar), with up to 5 possible covariates (covar)
  # mainvar should be a vector of continuous variable
  # all covars should also be vectors
  # df = dataframe or matrix of beta values
  # output is a vector of delta beta values
  sd=sd(mainvar)
  qt <-
    range <- max(mainvar, na.rm = T) - min(mainvar, na.rm = T)
  dB <- vector(mode = "numeric", length = nrow(df))
  names(dB) <- rownames(df)
  for (i in 1:nrow(df)) {
    beta <- df[i, ]
    if (is.null(covar1)) {
      mod <- lm(beta ~ mainvar)
    } else if (is.null(covar2)) {
      mod <- lm(beta ~ mainvar + covar1)
    } else if (is.null(covar3)) {
      mod <- lm(beta ~ mainvar + covar1 + covar2)
    } else if (is.null(covar4)) {
      mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3)
    } else if (is.null(covar5)) {
      mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4)
    } else {
      mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4 + covar5)
    }
    slope <- mod$coefficients[2]
    dB[i] <- as.numeric(slope*range)
  }
  dB
}

betas.funnorm.filt <- m2beta(M_values.funnorm.filt)

delta_beta_Age.Sex.SV.chip_fixed <- deltabeta(as.matrix(betas.funnorm.filt), AMD_pData$Age, covar1 = AMD_pData$Sex, covar2 = AMD_pData$SV, covar3 = AMD_pData$Chip)
length(delta_beta_Age.Sex.SV.chip_fixed)
summary(delta_beta_Age.Sex.SV.chip_fixed)

save(delta_beta_Age.Sex.SV.chip_fixed, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/DB_Age.Sex.SV.chip_fixed.RData")
##3. Linear Model: Volcano Plot
#Volcano to examine hits (for DB, see below chunks):
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/DB_Age.Sex.SV.chip_fixed.RData")

#Call Volcano (Nominal p Version, modified from Rachel's code):
source("/home/BCRICWH.LAN/dlin/KoborLab/kobor_space/cake/home/dlin/Volcano_DL_Nominal.R")

#After running the last 2 chunks, make a summary table with CpG, Nominal_P, FDR, and Delta Beta.
Age.Sex.SV.chip_Table <- data.frame(rownames(M_values.funnorm.filt), Age.Sex.SV.chip_LM_pval, Multi_test_corr_relaxed, delta_beta_Age.Sex.SV.chip_fixed)
colnames(Age.Sex.SV.chip_Table) = c("CpG", "Nominal_P", "FDR", "Delta_Beta")
identical(as.character(rownames(Age.Sex.SV.chip_Table)), as.character(Age.Sex.SV.chip_Table$CpG)) #TRUE.
## [1] TRUE
#Looking at top hits quickly without considering DB:
head(Age.Sex.SV.chip_Table[order(Age.Sex.SV.chip_Table$Nominal_P),],10)
##                   CpG    Nominal_P       FDR  Delta_Beta
## cg15155209 cg15155209 5.422114e-07 0.1028460 -0.20354698
## cg21132564 cg21132564 7.437805e-07 0.1028460  0.08116935
## cg24155129 cg24155129 8.366901e-07 0.1028460  0.17814272
## cg27375012 cg27375012 9.669248e-07 0.1028460  0.02885347
## cg25359907 cg25359907 1.527042e-06 0.1299378  0.04918882
## cg06968164 cg06968164 2.590083e-06 0.1449706 -0.16283149
## cg22197050 cg22197050 2.707523e-06 0.1449706  0.07129775
## cg19005438 cg19005438 2.725933e-06 0.1449706  0.15603705
## cg06173889 cg06173889 3.857674e-06 0.1513159  0.23571242
## cg05800683 cg05800683 3.877357e-06 0.1513159  0.01597645
##Setting a threshold of 0.05DB, 5e-6 Nominal P (scale to 0.60DB):
makeVolcano_nominal(Age.Sex.SV.chip_Table$Nominal_P, Age.Sex.SV.chip_Table$Delta_Beta, 0.05, 3e-6, "DNAm changes", 0.5) #at 5e-6: 9 Hypermethylated, 0 Hypomethylated
## [1] "Hypermethylated: 4"
## [1] "Hypomethylated: 2"
## Warning: Removed 1 rows containing missing values (geom_point).

#What are these hits?
#First make an annotated table - load 450K manifest.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
head(EPIC_Annotation_Complete)
##                  Name AddressA_ID
## cg07881041 cg07881041  0085713262
## cg18478105 cg18478105  0046761277
## cg23229610 cg23229610  0021717843
## cg03513874 cg03513874  0029622133
## cg09835024 cg09835024  0016745152
## cg05451842 cg05451842  0016681196
##                                              AlleleA_ProbeSeq AddressB_ID
## cg07881041 CTACAAATACAACACCCTCAACCCATATTTCATATATTATCTCATTTAAC            
## cg18478105 AAATAAATTTCACTCTCAAATCCCAATCTCATACAACAAAACAAAAACCA  0086644198
## cg23229610 ATAAAATTCTTTCCTTAAAAAACATTAAAACCAAAATAAACAAAAATTCC            
## cg03513874 ACAATAAAATAATAAAATCCCATCACTACTTACCCTCCTTAAATAATATC            
## cg09835024 AATAAACACCAACCCCAAACCAATCTCACTTTATTAAATTACAAAAATCA  0081631976
## cg05451842 CRTTCAAATACACTATAACCCRACTAAAAAAACCCCCAACAACCCAAAAC            
##                                              AlleleB_ProbeSeq
## cg07881041                                                   
## cg18478105 AAATAAATTTCGCTCTCAAATCCCAATCTCGTACGACGAAACGAAAACCG
## cg23229610                                                   
## cg03513874                                                   
## cg09835024 AATAAACGCCGACCCCGAACCGATCTCGCTTTATTAAATTACAAAAATCG
## cg05451842                                                   
##            Infinium_Design_Type Next_Base Color_Channel
## cg07881041                   II                        
## cg18478105                    I         C           Grn
## cg23229610                   II                        
## cg03513874                   II                        
## cg09835024                    I         A           Red
## cg05451842                   II                        
##                                                                                                                        Forward_Sequence
## cg07881041 CTGCACGCCTACTGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAA[CG]CATGTATCATCTCATTTAATGCATGCATTATCTCATTTAATTCTCACAACCCCTCAGGTG
## cg18478105 TCCCGTCTTACGGGATGGATTTCGCTCTCAGGTCCCAGTCTCGTGCGGCGGGGCGGGGAC[CG]CAGCCGGCTGGGCGGGGAAGCCCTGAGCCGGGGAAGTCACGTGGGGCGTGTCCGGAGGCG
## cg23229610 GTTTCTGGACAGTAAAATTCTTTCCTTGAAGGACATTAGGGCCAAAATGGGCAAGGATTC[CG]AGATTGGTACATCGAGCGTTATCTTCCAACTCTCTTTTCTAAATGGGCTCATTTAGTAAT
## cg03513874 ATTGTGCCCACCTTGCTGCTGACAGTTAAGCATCACTAAAGTAGGAAATAGGGTCCAAAC[CG]ACACTACTTAAGGAGGGCAAGTAGTGATGGGACCTCATCATCCCATTGCTATCATGGAGC
## cg09835024 AGCCCCGTCATAGGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAAT[CG]GTATTCAAAAAAAAAAAAAAAAAAGGGCGGGGAGGACACTCCCTCTTCTCTGTTCCCACA
## cg05451842 CACAGCGTGGATGCCCCGATTTCCCAGGTCCCTCCGCAACCCTCAGTAGAACTCCCACCG[CG]CCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAACGCCCCGGTCCGT
##            Genome_Build CHR   MAPINFO
## cg07881041           37  19   5236016
## cg18478105           37  20  61847650
## cg23229610           37   1   6841125
## cg03513874           37   2 198303466
## cg09835024           37   X  24072640
## cg05451842           37  14  93581139
##                                                     SourceSeq Strand
## cg07881041 TGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAACG      R
## cg18478105 CGGTCCCCGCCCCGCCGCACGAGACTGGGACCTGAGAGCGAAATCCATCC      R
## cg23229610 CGGAATCCTTGCCCATTTTGGCCCTAATGTCCTTCAAGGAAAGAATTTTA      R
## cg03513874 CAATGGGATGATGAGGTCCCATCACTACTTGCCCTCCTTAAGTAGTGTCG      F
## cg09835024 GGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAATCG      R
## cg05451842 CGCCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAAC      F
##                  UCSC_RefGene_Name                  UCSC_RefGene_Accession
## cg07881041 PTPRS;PTPRS;PTPRS;PTPRS NM_130855;NM_002850;NM_130854;NM_130853
## cg18478105                  YTHDF1                               NM_017798
## cg23229610                                                                
## cg03513874                                                                
## cg09835024                  EIF2S3                               NM_001415
## cg05451842       ITPK1;ITPK1;ITPK1     NM_001142593;NM_014216;NM_001142594
##             UCSC_RefGene_Group    UCSC_CpG_Islands_Name
## cg07881041 Body;Body;Body;Body    chr19:5237294-5237669
## cg18478105              TSS200  chr20:61846843-61848103
## cg23229610                         chr1:6844313-6846366
## cg03513874                     chr2:198299244-198299972
## cg09835024             TSS1500   chrX:24072558-24073135
## cg05451842      Body;Body;Body  chr14:93581083-93582797
##            Relation_to_UCSC_CpG_Island Phantom4_Enhancers Phantom5_Enhancers
## cg07881041                     N_Shore                                      
## cg18478105                      Island                                      
## cg23229610                     N_Shelf                                      
## cg03513874                     S_Shelf                                      
## cg09835024                      Island                                      
## cg05451842                      Island                                      
##            DMR X450k_Enhancer           HMM_Island Regulatory_Feature_Name
## cg07881041                 NA                                             
## cg18478105                 NA 20:61317142-61318498    20:61846284-61847956
## cg23229610                 NA                                             
## cg03513874                 NA                                             
## cg09835024                 NA                          X:24071907-24073667
## cg05451842                 NA 14:92650663-92652544                        
##            Regulatory_Feature_Group GencodeBasicV12_NAME
## cg07881041                                              
## cg18478105      Promoter_Associated        YTHDF1;YTHDF1
## cg23229610                                              
## cg03513874                                              
## cg09835024      Promoter_Associated               EIF2S3
## cg05451842                                         ITPK1
##                      GencodeBasicV12_Accession GencodeBasicV12_Group
## cg07881041                                                          
## cg18478105 ENST00000370334.4;ENST00000370339.3         TSS200;TSS200
## cg23229610                                                          
## cg03513874                                                          
## cg09835024                   ENST00000253039.4                TSS200
## cg05451842                   ENST00000555495.1                 5'UTR
##             GencodeCompV12_NAME
## cg07881041                     
## cg18478105        YTHDF1;YTHDF1
## cg23229610                     
## cg03513874                     
## cg09835024 EIF2S3;EIF2S3;EIF2S3
## cg05451842                ITPK1
##                                         GencodeCompV12_Accession
## cg07881041                                                      
## cg18478105                   ENST00000370334.4;ENST00000370339.3
## cg23229610                                                      
## cg03513874                                                      
## cg09835024 ENST00000487075.1;ENST00000423068.1;ENST00000253039.4
## cg05451842                                     ENST00000555495.1
##              GencodeCompV12_Group DNase_Hypersensitivity_NAME
## cg07881041                                                   
## cg18478105          TSS200;TSS200     chr20:61847520-61847755
## cg23229610                                                   
## cg03513874                                                   
## cg09835024 TSS1500;TSS1500;TSS200      chrX:24072600-24073395
## cg05451842                  5'UTR     chr14:93581080-93581375
##            DNase_Hypersensitivity_Evidence_Count OpenChromatin_NAME
## cg07881041                                    NA                   
## cg18478105                                     3                   
## cg23229610                                    NA                   
## cg03513874                                    NA                   
## cg09835024                                     3                   
## cg05451842                                     3                   
##            OpenChromatin_Evidence_Count TFBS_NAME TFBS_Evidence_Count
## cg07881041                           NA                            NA
## cg18478105                           NA                            NA
## cg23229610                           NA                            NA
## cg03513874                           NA                            NA
## cg09835024                           NA                            NA
## cg05451842                           NA                            NA
##            Methyl27_Loci Methyl450_Loci Chromosome_36 Coordinate_36
## cg07881041            NA           TRUE            19       5187016
## cg18478105            NA           TRUE            20      61318095
## cg23229610            NA           TRUE             1       6763712
## cg03513874            NA           TRUE             2     198011711
## cg09835024            NA           TRUE             X      23982561
## cg05451842            NA           TRUE            14      92650892
##                             SNP_ID SNP_DISTANCE SNP_MinorAlleleFrequency
## cg07881041             rs187313142           18                 0.000200
## cg18478105             rs549944121            5                 0.001797
## cg23229610 rs545824288;rs527255711        40;12        0.000200;0.001198
## cg03513874                                                              
## cg09835024                                                              
## cg05451842             rs550745821           22                 0.000200
##            Random_Loci  X strand Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs
## cg07881041          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg18478105          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg23229610          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg03513874          NA NA      +     <NA>        NA   <NA>      NA   <NA>
## cg09835024          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg05451842          NA NA      +     <NA>        NA   <NA>      NA   <NA>
##            SBE_maf CH_450_XY CH_450_Aut CH_EPIC Cross_Hyb
## cg07881041      NA        No         No      No        No
## cg18478105      NA        No         No      No        No
## cg23229610      NA        No         No      No        No
## cg03513874      NA        No         No      No        No
## cg09835024      NA        No         No      No        No
## cg05451842      NA        No         No      No        No
Age.Sex.SV.chip_Table.annotated = merge(Age.Sex.SV.chip_Table, EPIC_Annotation_Complete[,c("Name", "CHR", "Strand", "UCSC_RefGene_Name", "UCSC_RefGene_Group")], by.x = "CpG", by.y = "Name", all = FALSE)
colnames(Age.Sex.SV.chip_Table.annotated)[5:6] = c("Chromosome", "Coordinate")
Age.Sex.SV.chip_Table.annotated <- Age.Sex.SV.chip_Table.annotated[order(Age.Sex.SV.chip_Table.annotated$Nominal_P),]

#Grabbing the Volcano hits:
LM_Age.Sex.SV.chip_Hits <- Age.Sex.SV.chip_Table.annotated[which(abs(Age.Sex.SV.chip_Table.annotated$Delta_Beta)>0.05 & Age.Sex.SV.chip_Table.annotated$Nominal_P<4e-6),]

#Let's order by Nominal_P:
LM_Age.Sex.SV.chip_Hits = LM_Age.Sex.SV.chip_Hits[order(LM_Age.Sex.SV.chip_Hits$Nominal_P),]
rownames(LM_Age.Sex.SV.chip_Hits) = c()
str(LM_Age.Sex.SV.chip_Hits)
## 'data.frame':    6 obs. of  8 variables:
##  $ CpG               : Factor w/ 425456 levels "cg00000029","cg00000108",..: 246988 330979 119392 345056 301975 106386
##  $ Nominal_P         : num  5.42e-07 7.44e-07 2.59e-06 2.71e-06 2.73e-06 ...
##  $ FDR               : num  0.103 0.103 0.145 0.145 0.145 ...
##  $ Delta_Beta        : num  -0.2035 0.0812 -0.1628 0.0713 0.156 ...
##  $ Chromosome        : Factor w/ 25 levels "","1","10","11",..: 17 22 6 13 8 13
##  $ Coordinate        : Factor w/ 3 levels "","F","R": 2 2 2 3 2 3
##  $ UCSC_RefGene_Name : Factor w/ 66070 levels "","A1BG","A1BG-AS1;A1BG",..: 53494 2154 1 30522 20278 54913
##  $ UCSC_RefGene_Group: Factor w/ 8044 levels "","1stExon","1stExon;1stExon",..: 4133 3719 1 5441 5441 1209
head(LM_Age.Sex.SV.chip_Hits)
##          CpG    Nominal_P       FDR  Delta_Beta Chromosome Coordinate
## 1 cg15155209 5.422114e-07 0.1028460 -0.20354698          3          F
## 2 cg21132564 7.437805e-07 0.1028460  0.08116935          8          F
## 3 cg06968164 2.590083e-06 0.1449706 -0.16283149         13          F
## 4 cg22197050 2.707523e-06 0.1449706  0.07129775          2          R
## 5 cg19005438 2.725933e-06 0.1449706  0.15603705         15          F
## 6 cg06173889 3.857674e-06 0.1513159  0.23571242          2          R
##   UCSC_RefGene_Name UCSC_RefGene_Group
## 1     SLC6A6;SLC6A6          Body;Body
## 2              ANK1               Body
## 3                                     
## 4      LOC100132215            TSS1500
## 5             FOXB1            TSS1500
## 6       SOX11;SOX11      3'UTR;1stExon

Age-associated hits: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3482848/ Non-tissue specific hits: https://epigeneticsandchromatin.biomedcentral.com/articles/10.1186/s13072-018-0191-3

#Reorder row index.
rownames(Age.Sex.SV.chip_Table.annotated) <- NULL

#Look at hits based on candidate genes.
head(Age.Sex.SV.chip_Table.annotated[which(Age.Sex.SV.chip_Table.annotated$UCSC_RefGene_Name == "ELOVL2"),]) #First ELOVL2 hit == row 278.
##               CpG    Nominal_P       FDR   Delta_Beta Chromosome Coordinate
## 622    cg16867657 0.0009175385 0.5735714  0.100948234          6          F
## 7374   cg24724428 0.0136224422 0.7290164  0.096426750          6          F
## 20216  cg21572722 0.0415317908 0.8107440  0.050285162          6          F
## 28088  cg01799681 0.0599461434 0.8413430 -0.065848888          6          F
## 56933  cg16323298 0.1298762295 0.9010245 -0.031395803          6          F
## 174463 cg25151806 0.4262825709 0.9660973  0.008674443          6          R
##        UCSC_RefGene_Name UCSC_RefGene_Group
## 622               ELOVL2            TSS1500
## 7374              ELOVL2            TSS1500
## 20216             ELOVL2            TSS1500
## 28088             ELOVL2               Body
## 56933             ELOVL2            TSS1500
## 174463            ELOVL2            TSS1500
head(Age.Sex.SV.chip_Table.annotated[which(Age.Sex.SV.chip_Table.annotated$UCSC_RefGene_Name == "EDARADD"),]) #First EDARADD hit == row 111292.
##              CpG Nominal_P       FDR Delta_Beta Chromosome Coordinate
## 45040 cg18964582 0.1003107 0.8790123 0.01767369          1          R
##       UCSC_RefGene_Name UCSC_RefGene_Group
## 45040           EDARADD            TSS1500
head(Age.Sex.SV.chip_Table.annotated[which(Age.Sex.SV.chip_Table.annotated$UCSC_RefGene_Name == "TOM1L1"),]) #First TOM1L1 hit == row 2140.
##               CpG   Nominal_P       FDR Delta_Beta Chromosome Coordinate
## 3425   cg07081054 0.005911721 0.6785229 0.06236799         17          R
## 7211   cg25431220 0.013323336 0.7282549 0.01828383         17          F
## 19934  cg05265484 0.040972118 0.8106590 0.08024987         17          F
## 35720  cg10237252 0.078017910 0.8608219 0.11466416         17          R
## 135225 cg12240603 0.326746589 0.9552553 0.02000998         17          F
## 157845 cg13913085 0.383748920 0.9612396 0.04980854         17          F
##        UCSC_RefGene_Name UCSC_RefGene_Group
## 3425              TOM1L1               Body
## 7211              TOM1L1             TSS200
## 19934             TOM1L1            TSS1500
## 35720             TOM1L1            TSS1500
## 135225            TOM1L1            TSS1500
## 157845            TOM1L1               Body
head(Age.Sex.SV.chip_Table.annotated[which(Age.Sex.SV.chip_Table.annotated$UCSC_RefGene_Name == "NPTX2"),]) #First NPTX2 hit == row 8828.
##              CpG  Nominal_P       FDR  Delta_Beta Chromosome Coordinate
## 12311 cg13585675 0.02404082 0.7713448  0.05686261          7          F
## 19405 cg02368096 0.03975083 0.8082895  0.05827515          7          R
## 35988 cg05168977 0.07864802 0.8616515 -0.05100356          7          F
## 52267 cg13878520 0.11817578 0.8921699  0.02835619          7          R
## 54246 cg13695954 0.12309024 0.8957100  0.01411725          7          F
## 67178 cg13314145 0.15524681 0.9128074  0.02251316          7          R
##       UCSC_RefGene_Name UCSC_RefGene_Group
## 12311             NPTX2               Body
## 19405             NPTX2               Body
## 35988             NPTX2            1stExon
## 52267             NPTX2               Body
## 54246             NPTX2               Body
## 67178             NPTX2            TSS1500

EWAS Model 5.1: AMD samples on Age + Sex + SV + chip.

AMD samples only with sex, SV (cell type) and chip as covariates

#Load all the objects.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_SVA_M_values.funnorm.RData") 

dim(AMD_pData) #44 samples.
## [1] 44 22
dim(M_values.funnorm.filt) #425456 probes.
## [1] 425456     44
AMD_pData$Disease_State <- as.factor(AMD_pData$Disease_State)
AMD_pData$Sex <- as.factor(AMD_pData$Sex)
AMD_pData$Row <- as.factor(AMD_pData$Row)
AMD_pData$Chip <- as.factor(AMD_pData$Chip)
AMD_pData$Age <- as.numeric(AMD_pData$Age)

str(AMD_pData)
## 'data.frame':    44 obs. of  22 variables:
##  $ Sample_Name  : chr  "Sample 1" "Sample 10" "Sample 11" "Sample 12" ...
##  $ Disease_State: Factor w/ 2 levels "age-related macular degeneration",..: 2 2 2 1 1 1 2 1 1 1 ...
##  $ Sex          : Factor w/ 2 levels "F","M": 2 2 2 2 2 1 2 2 1 2 ...
##  $ Age          : num  61 74 70 76 79 89 66 70 83 76 ...
##  $ Tissue       : chr  "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" ...
##  $ Row          : Factor w/ 12 levels "R01C01","R01C02",..: 3 10 12 1 3 5 7 9 11 2 ...
##  $ Chip         : Factor w/ 4 levels "200723300084",..: 2 2 2 4 4 4 4 4 4 4 ...
##  $ Basename     : chr  "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
##  $ filenames    : chr  "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
##  $ xMed         : num  11.4 11.7 11.9 11.7 11.6 ...
##  $ yMed         : num  11.7 12 12.1 11.9 11.9 ...
##  $ predictedSex : chr  "M" "M" "M" "M" ...
##  $ SV           : num  -0.1222 -0.011 -0.1074 -0.073 -0.0265 ...
##  $ Epithelial   : num  0.387 0.436 0.367 0.413 0.387 ...
##  $ Fibroblast   : num  0.326 0.316 0.298 0.24 0.312 ...
##  $ B_Cell       : num  0.0502 0.0454 0.0582 0.0528 0.0482 ...
##  $ NK_Cell      : num  0.065 0.0601 0.0816 0.0986 0.0743 ...
##  $ CD4T         : num  0.0478 0.0467 0.0545 0.0658 0.0544 ...
##  $ CD8T         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Monocyte     : num  0.0685 0.05 0.081 0.0666 0.0651 ...
##  $ Neutrophil   : num  0 0 0 0 0 ...
##  $ Eosinophil   : num  0.0555 0.0457 0.0597 0.0632 0.0597 ...
#Check order.
identical(rownames(AMD_pData), colnames(M_values.funnorm.filt)) #TRUE. 
## [1] TRUE
#Sanity check - there should be no NAs or infinite numbers - which could be a result of logit transformation of 0 or 1 beta values. 
all(complete.cases(M_values.funnorm.filt)) == "TRUE" #TRUE - meaning no NA or infinite numbers. 
## [1] TRUE
#Subset AMD samples only.
AMD <- subset(AMD_pData, Disease_State == "age-related macular degeneration")
dim(AMD) #25 samples.
## [1] 25 22
AMD_M_values.funnorm.filt <- M_values.funnorm.filt[, colnames(M_values.funnorm.filt) %in% rownames(AMD)]
dim(AMD_M_values.funnorm.filt) #425456 probes, 25 samples.
## [1] 425456     25
rm(AMD_pData)
rm(M_values.funnorm.filt)
library(pbapply) #Progress bar for apply functions.

#EWAS on Age - All samples + Sex + SV + Chip.
#LM: Need to use transformed M-values instead of beta values as it is more statistically sound.
AMD.only_Age.Sex.SV.chip_LM_pval <- pbsapply(1:nrow(AMD_M_values.funnorm.filt), function(CpG){
  meta <- AMD
  meta$Mval <- AMD_M_values.funnorm.filt[CpG,]
  mod_AMD.only_Age.Sex.SV.chip <- lm(Mval ~ Age + Sex + SV + Chip, data = meta) #Only Sex + SV + Chip as covariate.
  coef(summary(mod_AMD.only_Age.Sex.SV.chip))[2,4]}) #Returns nominal p-value for Age for model at each CpG.
head(AMD.only_Age.Sex.SV.chip_LM_pval)

save(AMD.only_Age.Sex.SV.chip_LM_pval, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD.only_Age.Sex.SV.chip_LM_pval.RData")
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD.only_Age.Sex.SV.chip_LM_pval.RData")

#Inspect p-value distribution for model.
pvalue_dist_AMD.only_Age.Sex.SV.chip <- data.frame(CpG = rownames(AMD_M_values.funnorm.filt), Nominal_P = AMD.only_Age.Sex.SV.chip_LM_pval)
ggplot(pvalue_dist_AMD.only_Age.Sex.SV.chip, aes(Nominal_P)) + 
  geom_histogram(fill = "grey90", color = "black") + 
  theme_classic() + xlab("Nominal P Value") + 
  ylim(0, 20000) + 
  xlim(min(AMD.only_Age.Sex.SV.chip_LM_pval), max(AMD.only_Age.Sex.SV.chip_LM_pval))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).

#Not right-skewed. Distribution is a little even. 

#Multiple test correction with FDR.
AMD_M_values.funnorm.filt <- as.data.frame(AMD_M_values.funnorm.filt)
Multi_test_corr_relaxed <- p.adjust(AMD.only_Age.Sex.SV.chip_LM_pval, method = "fdr", n = length(AMD.only_Age.Sex.SV.chip_LM_pval))

#Looking at FDR thresholds for hits:
dim(as.data.frame(AMD_M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.05),]) #0 at 0.05.
## [1]  0 25
dim(as.data.frame(AMD_M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.1),]) #0 at 0.1.
## [1]  2 25
dim(as.data.frame(AMD_M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.2),]) #2 at 0.2.
## [1]  2 25
#Looking at top hits by nominal P:
pvalue_dist_AMD.only_Age.Sex.SV.chip <- pvalue_dist_AMD.only_Age.Sex.SV.chip[order(pvalue_dist_AMD.only_Age.Sex.SV.chip$Nominal_P),]
head(pvalue_dist_AMD.only_Age.Sex.SV.chip)
##               CpG    Nominal_P
## 206218 cg07306755 1.390008e-07
## 215414 cg14625731 4.676746e-07
## 296713 cg06968164 4.794239e-06
## 106709 cg26845297 6.235855e-06
## 27468  cg24155129 6.972155e-06
## 409036 cg06049791 7.078629e-06
#Load 450K annotation data.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
head(EPIC_Annotation_Complete)
##                  Name AddressA_ID
## cg07881041 cg07881041  0085713262
## cg18478105 cg18478105  0046761277
## cg23229610 cg23229610  0021717843
## cg03513874 cg03513874  0029622133
## cg09835024 cg09835024  0016745152
## cg05451842 cg05451842  0016681196
##                                              AlleleA_ProbeSeq AddressB_ID
## cg07881041 CTACAAATACAACACCCTCAACCCATATTTCATATATTATCTCATTTAAC            
## cg18478105 AAATAAATTTCACTCTCAAATCCCAATCTCATACAACAAAACAAAAACCA  0086644198
## cg23229610 ATAAAATTCTTTCCTTAAAAAACATTAAAACCAAAATAAACAAAAATTCC            
## cg03513874 ACAATAAAATAATAAAATCCCATCACTACTTACCCTCCTTAAATAATATC            
## cg09835024 AATAAACACCAACCCCAAACCAATCTCACTTTATTAAATTACAAAAATCA  0081631976
## cg05451842 CRTTCAAATACACTATAACCCRACTAAAAAAACCCCCAACAACCCAAAAC            
##                                              AlleleB_ProbeSeq
## cg07881041                                                   
## cg18478105 AAATAAATTTCGCTCTCAAATCCCAATCTCGTACGACGAAACGAAAACCG
## cg23229610                                                   
## cg03513874                                                   
## cg09835024 AATAAACGCCGACCCCGAACCGATCTCGCTTTATTAAATTACAAAAATCG
## cg05451842                                                   
##            Infinium_Design_Type Next_Base Color_Channel
## cg07881041                   II                        
## cg18478105                    I         C           Grn
## cg23229610                   II                        
## cg03513874                   II                        
## cg09835024                    I         A           Red
## cg05451842                   II                        
##                                                                                                                        Forward_Sequence
## cg07881041 CTGCACGCCTACTGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAA[CG]CATGTATCATCTCATTTAATGCATGCATTATCTCATTTAATTCTCACAACCCCTCAGGTG
## cg18478105 TCCCGTCTTACGGGATGGATTTCGCTCTCAGGTCCCAGTCTCGTGCGGCGGGGCGGGGAC[CG]CAGCCGGCTGGGCGGGGAAGCCCTGAGCCGGGGAAGTCACGTGGGGCGTGTCCGGAGGCG
## cg23229610 GTTTCTGGACAGTAAAATTCTTTCCTTGAAGGACATTAGGGCCAAAATGGGCAAGGATTC[CG]AGATTGGTACATCGAGCGTTATCTTCCAACTCTCTTTTCTAAATGGGCTCATTTAGTAAT
## cg03513874 ATTGTGCCCACCTTGCTGCTGACAGTTAAGCATCACTAAAGTAGGAAATAGGGTCCAAAC[CG]ACACTACTTAAGGAGGGCAAGTAGTGATGGGACCTCATCATCCCATTGCTATCATGGAGC
## cg09835024 AGCCCCGTCATAGGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAAT[CG]GTATTCAAAAAAAAAAAAAAAAAAGGGCGGGGAGGACACTCCCTCTTCTCTGTTCCCACA
## cg05451842 CACAGCGTGGATGCCCCGATTTCCCAGGTCCCTCCGCAACCCTCAGTAGAACTCCCACCG[CG]CCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAACGCCCCGGTCCGT
##            Genome_Build CHR   MAPINFO
## cg07881041           37  19   5236016
## cg18478105           37  20  61847650
## cg23229610           37   1   6841125
## cg03513874           37   2 198303466
## cg09835024           37   X  24072640
## cg05451842           37  14  93581139
##                                                     SourceSeq Strand
## cg07881041 TGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAACG      R
## cg18478105 CGGTCCCCGCCCCGCCGCACGAGACTGGGACCTGAGAGCGAAATCCATCC      R
## cg23229610 CGGAATCCTTGCCCATTTTGGCCCTAATGTCCTTCAAGGAAAGAATTTTA      R
## cg03513874 CAATGGGATGATGAGGTCCCATCACTACTTGCCCTCCTTAAGTAGTGTCG      F
## cg09835024 GGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAATCG      R
## cg05451842 CGCCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAAC      F
##                  UCSC_RefGene_Name                  UCSC_RefGene_Accession
## cg07881041 PTPRS;PTPRS;PTPRS;PTPRS NM_130855;NM_002850;NM_130854;NM_130853
## cg18478105                  YTHDF1                               NM_017798
## cg23229610                                                                
## cg03513874                                                                
## cg09835024                  EIF2S3                               NM_001415
## cg05451842       ITPK1;ITPK1;ITPK1     NM_001142593;NM_014216;NM_001142594
##             UCSC_RefGene_Group    UCSC_CpG_Islands_Name
## cg07881041 Body;Body;Body;Body    chr19:5237294-5237669
## cg18478105              TSS200  chr20:61846843-61848103
## cg23229610                         chr1:6844313-6846366
## cg03513874                     chr2:198299244-198299972
## cg09835024             TSS1500   chrX:24072558-24073135
## cg05451842      Body;Body;Body  chr14:93581083-93582797
##            Relation_to_UCSC_CpG_Island Phantom4_Enhancers Phantom5_Enhancers
## cg07881041                     N_Shore                                      
## cg18478105                      Island                                      
## cg23229610                     N_Shelf                                      
## cg03513874                     S_Shelf                                      
## cg09835024                      Island                                      
## cg05451842                      Island                                      
##            DMR X450k_Enhancer           HMM_Island Regulatory_Feature_Name
## cg07881041                 NA                                             
## cg18478105                 NA 20:61317142-61318498    20:61846284-61847956
## cg23229610                 NA                                             
## cg03513874                 NA                                             
## cg09835024                 NA                          X:24071907-24073667
## cg05451842                 NA 14:92650663-92652544                        
##            Regulatory_Feature_Group GencodeBasicV12_NAME
## cg07881041                                              
## cg18478105      Promoter_Associated        YTHDF1;YTHDF1
## cg23229610                                              
## cg03513874                                              
## cg09835024      Promoter_Associated               EIF2S3
## cg05451842                                         ITPK1
##                      GencodeBasicV12_Accession GencodeBasicV12_Group
## cg07881041                                                          
## cg18478105 ENST00000370334.4;ENST00000370339.3         TSS200;TSS200
## cg23229610                                                          
## cg03513874                                                          
## cg09835024                   ENST00000253039.4                TSS200
## cg05451842                   ENST00000555495.1                 5'UTR
##             GencodeCompV12_NAME
## cg07881041                     
## cg18478105        YTHDF1;YTHDF1
## cg23229610                     
## cg03513874                     
## cg09835024 EIF2S3;EIF2S3;EIF2S3
## cg05451842                ITPK1
##                                         GencodeCompV12_Accession
## cg07881041                                                      
## cg18478105                   ENST00000370334.4;ENST00000370339.3
## cg23229610                                                      
## cg03513874                                                      
## cg09835024 ENST00000487075.1;ENST00000423068.1;ENST00000253039.4
## cg05451842                                     ENST00000555495.1
##              GencodeCompV12_Group DNase_Hypersensitivity_NAME
## cg07881041                                                   
## cg18478105          TSS200;TSS200     chr20:61847520-61847755
## cg23229610                                                   
## cg03513874                                                   
## cg09835024 TSS1500;TSS1500;TSS200      chrX:24072600-24073395
## cg05451842                  5'UTR     chr14:93581080-93581375
##            DNase_Hypersensitivity_Evidence_Count OpenChromatin_NAME
## cg07881041                                    NA                   
## cg18478105                                     3                   
## cg23229610                                    NA                   
## cg03513874                                    NA                   
## cg09835024                                     3                   
## cg05451842                                     3                   
##            OpenChromatin_Evidence_Count TFBS_NAME TFBS_Evidence_Count
## cg07881041                           NA                            NA
## cg18478105                           NA                            NA
## cg23229610                           NA                            NA
## cg03513874                           NA                            NA
## cg09835024                           NA                            NA
## cg05451842                           NA                            NA
##            Methyl27_Loci Methyl450_Loci Chromosome_36 Coordinate_36
## cg07881041            NA           TRUE            19       5187016
## cg18478105            NA           TRUE            20      61318095
## cg23229610            NA           TRUE             1       6763712
## cg03513874            NA           TRUE             2     198011711
## cg09835024            NA           TRUE             X      23982561
## cg05451842            NA           TRUE            14      92650892
##                             SNP_ID SNP_DISTANCE SNP_MinorAlleleFrequency
## cg07881041             rs187313142           18                 0.000200
## cg18478105             rs549944121            5                 0.001797
## cg23229610 rs545824288;rs527255711        40;12        0.000200;0.001198
## cg03513874                                                              
## cg09835024                                                              
## cg05451842             rs550745821           22                 0.000200
##            Random_Loci  X strand Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs
## cg07881041          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg18478105          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg23229610          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg03513874          NA NA      +     <NA>        NA   <NA>      NA   <NA>
## cg09835024          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg05451842          NA NA      +     <NA>        NA   <NA>      NA   <NA>
##            SBE_maf CH_450_XY CH_450_Aut CH_EPIC Cross_Hyb
## cg07881041      NA        No         No      No        No
## cg18478105      NA        No         No      No        No
## cg23229610      NA        No         No      No        No
## cg03513874      NA        No         No      No        No
## cg09835024      NA        No         No      No        No
## cg05451842      NA        No         No      No        No
dim(hits_CpGs <- pvalue_dist_AMD.only_Age.Sex.SV.chip[which(pvalue_dist_AMD.only_Age.Sex.SV.chip$Nominal_P < 1e-6),]) #2 hits.
## [1] 2 2
hits <- EPIC_Annotation_Complete[which(EPIC_Annotation_Complete$Name%in%hits_CpGs$CpG),]
hits$UCSC_RefGene_Name
## [1]         GEM;GEM
## 66070 Levels:  A1BG A1BG-AS1;A1BG A1BG-AS1;A1BG;ZNF497;ZNF497 ... ZZZ3;ZZZ3;ZZZ3
#Delta beta.
#Using Maggie's code for deltabeta:
deltabeta <- function(df, mainvar, covar1 = NULL, covar2 = NULL, covar3 = NULL, covar4 = NULL, covar5 = NULL) {
  # Calculating delta beta of the main variable of interest (mainvar), with up to 5 possible covariates (covar)
  # mainvar should be a vector of continuous variable
  # all covars should also be vectors
  # df = dataframe or matrix of beta values
  # output is a vector of delta beta values
  sd=sd(mainvar)
  qt <-
    range <- max(mainvar, na.rm = T) - min(mainvar, na.rm = T)
  dB <- vector(mode = "numeric", length = nrow(df))
  names(dB) <- rownames(df)
  for (i in 1:nrow(df)) {
    beta <- df[i, ]
    if (is.null(covar1)) {
      mod <- lm(beta ~ mainvar)
    } else if (is.null(covar2)) {
      mod <- lm(beta ~ mainvar + covar1)
    } else if (is.null(covar3)) {
      mod <- lm(beta ~ mainvar + covar1 + covar2)
    } else if (is.null(covar4)) {
      mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3)
    } else if (is.null(covar5)) {
      mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4)
    } else {
      mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4 + covar5)
    }
    slope <- mod$coefficients[2]
    dB[i] <- as.numeric(slope*range)
  }
  dB
}

AMD_betas.funnorm.filt <- m2beta(AMD_M_values.funnorm.filt)

delta_beta_AMD.only_Age.Sex.SV.chip_fixed <- deltabeta(as.matrix(AMD_betas.funnorm.filt), AMD$Age, covar1 = AMD$Sex, covar2 = AMD$SV, covar3 = AMD$Chip)
length(delta_beta_AMD.only_Age.Sex.SV.chip_fixed)
summary(delta_beta_AMD.only_Age.Sex.SV.chip_fixed)

save(delta_beta_AMD.only_Age.Sex.SV.chip_fixed, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/DB_AMD.only_Age.Sex.SV.chip_fixed.RData")

EWAS Model 6: All samples on Age + Chip.

Chip as covariate.

#Load all the objects.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_SVA_M_values.funnorm.RData") 

dim(AMD_pData) #44 samples.
## [1] 44 22
dim(M_values.funnorm.filt) #425456 probes.
## [1] 425456     44
AMD_pData$Disease_State <- as.factor(AMD_pData$Disease_State)
AMD_pData$Sex <- as.factor(AMD_pData$Sex)
AMD_pData$Row <- as.factor(AMD_pData$Row)
AMD_pData$Chip <- as.factor(AMD_pData$Chip)
AMD_pData$Age <- as.numeric(AMD_pData$Age)

str(AMD_pData)
## 'data.frame':    44 obs. of  22 variables:
##  $ Sample_Name  : chr  "Sample 1" "Sample 10" "Sample 11" "Sample 12" ...
##  $ Disease_State: Factor w/ 2 levels "age-related macular degeneration",..: 2 2 2 1 1 1 2 1 1 1 ...
##  $ Sex          : Factor w/ 2 levels "F","M": 2 2 2 2 2 1 2 2 1 2 ...
##  $ Age          : num  61 74 70 76 79 89 66 70 83 76 ...
##  $ Tissue       : chr  "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" ...
##  $ Row          : Factor w/ 12 levels "R01C01","R01C02",..: 3 10 12 1 3 5 7 9 11 2 ...
##  $ Chip         : Factor w/ 4 levels "200723300084",..: 2 2 2 4 4 4 4 4 4 4 ...
##  $ Basename     : chr  "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
##  $ filenames    : chr  "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
##  $ xMed         : num  11.4 11.7 11.9 11.7 11.6 ...
##  $ yMed         : num  11.7 12 12.1 11.9 11.9 ...
##  $ predictedSex : chr  "M" "M" "M" "M" ...
##  $ SV           : num  -0.1222 -0.011 -0.1074 -0.073 -0.0265 ...
##  $ Epithelial   : num  0.387 0.436 0.367 0.413 0.387 ...
##  $ Fibroblast   : num  0.326 0.316 0.298 0.24 0.312 ...
##  $ B_Cell       : num  0.0502 0.0454 0.0582 0.0528 0.0482 ...
##  $ NK_Cell      : num  0.065 0.0601 0.0816 0.0986 0.0743 ...
##  $ CD4T         : num  0.0478 0.0467 0.0545 0.0658 0.0544 ...
##  $ CD8T         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Monocyte     : num  0.0685 0.05 0.081 0.0666 0.0651 ...
##  $ Neutrophil   : num  0 0 0 0 0 ...
##  $ Eosinophil   : num  0.0555 0.0457 0.0597 0.0632 0.0597 ...
#Check order.
identical(rownames(AMD_pData), colnames(M_values.funnorm.filt)) #TRUE. 
## [1] TRUE
#Sanity check - there should be no NAs or infinite numbers - which could be a result of logit transformation of 0 or 1 beta values. 
all(complete.cases(M_values.funnorm.filt)) == "TRUE" #TRUE - meaning no NA or infinite numbers. 
## [1] TRUE
library(pbapply) #Progress bar for apply functions.

#EWAS on Age - All samples + Chip.
#LM: Need to use transformed M-values instead of beta values as it is more statistically sound.
Age.chip_LM_pval <- pbsapply(1:nrow(M_values.funnorm.filt), function(CpG){
  meta <- AMD_pData
  meta$Mval <- M_values.funnorm.filt[CpG,]
  mod_Age.chip <- lm(Mval ~ Age + Chip, data = meta) #Only Chip as covariate.
  coef(summary(mod_Age.chip))[2,4]}) #Returns nominal p-value for Age for model at each CpG.
head(Age.chip_LM_pval)

save(Age.chip_LM_pval, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Age.chip_LM_pval.RData")
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Age.chip_LM_pval.RData")

#Inspect p-value distribution for model.
pvalue_dist_Age.chip <- data.frame(CpG = rownames(M_values.funnorm.filt), Nominal_P = Age.chip_LM_pval)
ggplot(pvalue_dist_Age.chip, aes(Nominal_P)) + 
  geom_histogram(fill = "grey90", color = "black") + 
  theme_classic() + xlab("Nominal P Value") + 
  ylim(0, 20000) + 
  xlim(min(Age.chip_LM_pval), max(Age.chip_LM_pval))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).

#Not right-skewed. Distribution is a little even. 

#Multiple test correction with FDR.
M_values.funnorm.filt <- as.data.frame(M_values.funnorm.filt)
Multi_test_corr_relaxed <- p.adjust(Age.chip_LM_pval, method = "fdr", n = length(Age.chip_LM_pval))

#Looking at FDR thresholds for hits:
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.05),]) #0 at 0.05.
## [1]  0 44
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.1),]) #0 at 0.1.
## [1]  0 44
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.2),]) #32 at 0.2.
## [1] 32 44
#Looking at top hits by nominal P:
pvalue_dist_Age.chip <- pvalue_dist_Age.chip[order(pvalue_dist_Age.chip$Nominal_P),]
head(pvalue_dist_Age.chip)
##               CpG    Nominal_P
## 75729  cg15155209 5.551609e-07
## 100675 cg25359907 1.027507e-06
## 296713 cg06968164 1.297871e-06
## 323108 cg19005438 1.445053e-06
## 43977  cg06173889 2.267861e-06
## 51300  cg22197050 2.610535e-06
#Load 450K annotation data.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
head(EPIC_Annotation_Complete)
##                  Name AddressA_ID
## cg07881041 cg07881041  0085713262
## cg18478105 cg18478105  0046761277
## cg23229610 cg23229610  0021717843
## cg03513874 cg03513874  0029622133
## cg09835024 cg09835024  0016745152
## cg05451842 cg05451842  0016681196
##                                              AlleleA_ProbeSeq AddressB_ID
## cg07881041 CTACAAATACAACACCCTCAACCCATATTTCATATATTATCTCATTTAAC            
## cg18478105 AAATAAATTTCACTCTCAAATCCCAATCTCATACAACAAAACAAAAACCA  0086644198
## cg23229610 ATAAAATTCTTTCCTTAAAAAACATTAAAACCAAAATAAACAAAAATTCC            
## cg03513874 ACAATAAAATAATAAAATCCCATCACTACTTACCCTCCTTAAATAATATC            
## cg09835024 AATAAACACCAACCCCAAACCAATCTCACTTTATTAAATTACAAAAATCA  0081631976
## cg05451842 CRTTCAAATACACTATAACCCRACTAAAAAAACCCCCAACAACCCAAAAC            
##                                              AlleleB_ProbeSeq
## cg07881041                                                   
## cg18478105 AAATAAATTTCGCTCTCAAATCCCAATCTCGTACGACGAAACGAAAACCG
## cg23229610                                                   
## cg03513874                                                   
## cg09835024 AATAAACGCCGACCCCGAACCGATCTCGCTTTATTAAATTACAAAAATCG
## cg05451842                                                   
##            Infinium_Design_Type Next_Base Color_Channel
## cg07881041                   II                        
## cg18478105                    I         C           Grn
## cg23229610                   II                        
## cg03513874                   II                        
## cg09835024                    I         A           Red
## cg05451842                   II                        
##                                                                                                                        Forward_Sequence
## cg07881041 CTGCACGCCTACTGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAA[CG]CATGTATCATCTCATTTAATGCATGCATTATCTCATTTAATTCTCACAACCCCTCAGGTG
## cg18478105 TCCCGTCTTACGGGATGGATTTCGCTCTCAGGTCCCAGTCTCGTGCGGCGGGGCGGGGAC[CG]CAGCCGGCTGGGCGGGGAAGCCCTGAGCCGGGGAAGTCACGTGGGGCGTGTCCGGAGGCG
## cg23229610 GTTTCTGGACAGTAAAATTCTTTCCTTGAAGGACATTAGGGCCAAAATGGGCAAGGATTC[CG]AGATTGGTACATCGAGCGTTATCTTCCAACTCTCTTTTCTAAATGGGCTCATTTAGTAAT
## cg03513874 ATTGTGCCCACCTTGCTGCTGACAGTTAAGCATCACTAAAGTAGGAAATAGGGTCCAAAC[CG]ACACTACTTAAGGAGGGCAAGTAGTGATGGGACCTCATCATCCCATTGCTATCATGGAGC
## cg09835024 AGCCCCGTCATAGGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAAT[CG]GTATTCAAAAAAAAAAAAAAAAAAGGGCGGGGAGGACACTCCCTCTTCTCTGTTCCCACA
## cg05451842 CACAGCGTGGATGCCCCGATTTCCCAGGTCCCTCCGCAACCCTCAGTAGAACTCCCACCG[CG]CCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAACGCCCCGGTCCGT
##            Genome_Build CHR   MAPINFO
## cg07881041           37  19   5236016
## cg18478105           37  20  61847650
## cg23229610           37   1   6841125
## cg03513874           37   2 198303466
## cg09835024           37   X  24072640
## cg05451842           37  14  93581139
##                                                     SourceSeq Strand
## cg07881041 TGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAACG      R
## cg18478105 CGGTCCCCGCCCCGCCGCACGAGACTGGGACCTGAGAGCGAAATCCATCC      R
## cg23229610 CGGAATCCTTGCCCATTTTGGCCCTAATGTCCTTCAAGGAAAGAATTTTA      R
## cg03513874 CAATGGGATGATGAGGTCCCATCACTACTTGCCCTCCTTAAGTAGTGTCG      F
## cg09835024 GGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAATCG      R
## cg05451842 CGCCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAAC      F
##                  UCSC_RefGene_Name                  UCSC_RefGene_Accession
## cg07881041 PTPRS;PTPRS;PTPRS;PTPRS NM_130855;NM_002850;NM_130854;NM_130853
## cg18478105                  YTHDF1                               NM_017798
## cg23229610                                                                
## cg03513874                                                                
## cg09835024                  EIF2S3                               NM_001415
## cg05451842       ITPK1;ITPK1;ITPK1     NM_001142593;NM_014216;NM_001142594
##             UCSC_RefGene_Group    UCSC_CpG_Islands_Name
## cg07881041 Body;Body;Body;Body    chr19:5237294-5237669
## cg18478105              TSS200  chr20:61846843-61848103
## cg23229610                         chr1:6844313-6846366
## cg03513874                     chr2:198299244-198299972
## cg09835024             TSS1500   chrX:24072558-24073135
## cg05451842      Body;Body;Body  chr14:93581083-93582797
##            Relation_to_UCSC_CpG_Island Phantom4_Enhancers Phantom5_Enhancers
## cg07881041                     N_Shore                                      
## cg18478105                      Island                                      
## cg23229610                     N_Shelf                                      
## cg03513874                     S_Shelf                                      
## cg09835024                      Island                                      
## cg05451842                      Island                                      
##            DMR X450k_Enhancer           HMM_Island Regulatory_Feature_Name
## cg07881041                 NA                                             
## cg18478105                 NA 20:61317142-61318498    20:61846284-61847956
## cg23229610                 NA                                             
## cg03513874                 NA                                             
## cg09835024                 NA                          X:24071907-24073667
## cg05451842                 NA 14:92650663-92652544                        
##            Regulatory_Feature_Group GencodeBasicV12_NAME
## cg07881041                                              
## cg18478105      Promoter_Associated        YTHDF1;YTHDF1
## cg23229610                                              
## cg03513874                                              
## cg09835024      Promoter_Associated               EIF2S3
## cg05451842                                         ITPK1
##                      GencodeBasicV12_Accession GencodeBasicV12_Group
## cg07881041                                                          
## cg18478105 ENST00000370334.4;ENST00000370339.3         TSS200;TSS200
## cg23229610                                                          
## cg03513874                                                          
## cg09835024                   ENST00000253039.4                TSS200
## cg05451842                   ENST00000555495.1                 5'UTR
##             GencodeCompV12_NAME
## cg07881041                     
## cg18478105        YTHDF1;YTHDF1
## cg23229610                     
## cg03513874                     
## cg09835024 EIF2S3;EIF2S3;EIF2S3
## cg05451842                ITPK1
##                                         GencodeCompV12_Accession
## cg07881041                                                      
## cg18478105                   ENST00000370334.4;ENST00000370339.3
## cg23229610                                                      
## cg03513874                                                      
## cg09835024 ENST00000487075.1;ENST00000423068.1;ENST00000253039.4
## cg05451842                                     ENST00000555495.1
##              GencodeCompV12_Group DNase_Hypersensitivity_NAME
## cg07881041                                                   
## cg18478105          TSS200;TSS200     chr20:61847520-61847755
## cg23229610                                                   
## cg03513874                                                   
## cg09835024 TSS1500;TSS1500;TSS200      chrX:24072600-24073395
## cg05451842                  5'UTR     chr14:93581080-93581375
##            DNase_Hypersensitivity_Evidence_Count OpenChromatin_NAME
## cg07881041                                    NA                   
## cg18478105                                     3                   
## cg23229610                                    NA                   
## cg03513874                                    NA                   
## cg09835024                                     3                   
## cg05451842                                     3                   
##            OpenChromatin_Evidence_Count TFBS_NAME TFBS_Evidence_Count
## cg07881041                           NA                            NA
## cg18478105                           NA                            NA
## cg23229610                           NA                            NA
## cg03513874                           NA                            NA
## cg09835024                           NA                            NA
## cg05451842                           NA                            NA
##            Methyl27_Loci Methyl450_Loci Chromosome_36 Coordinate_36
## cg07881041            NA           TRUE            19       5187016
## cg18478105            NA           TRUE            20      61318095
## cg23229610            NA           TRUE             1       6763712
## cg03513874            NA           TRUE             2     198011711
## cg09835024            NA           TRUE             X      23982561
## cg05451842            NA           TRUE            14      92650892
##                             SNP_ID SNP_DISTANCE SNP_MinorAlleleFrequency
## cg07881041             rs187313142           18                 0.000200
## cg18478105             rs549944121            5                 0.001797
## cg23229610 rs545824288;rs527255711        40;12        0.000200;0.001198
## cg03513874                                                              
## cg09835024                                                              
## cg05451842             rs550745821           22                 0.000200
##            Random_Loci  X strand Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs
## cg07881041          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg18478105          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg23229610          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg03513874          NA NA      +     <NA>        NA   <NA>      NA   <NA>
## cg09835024          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg05451842          NA NA      +     <NA>        NA   <NA>      NA   <NA>
##            SBE_maf CH_450_XY CH_450_Aut CH_EPIC Cross_Hyb
## cg07881041      NA        No         No      No        No
## cg18478105      NA        No         No      No        No
## cg23229610      NA        No         No      No        No
## cg03513874      NA        No         No      No        No
## cg09835024      NA        No         No      No        No
## cg05451842      NA        No         No      No        No
dim(hits_CpGs <- pvalue_dist_Age.chip[which(pvalue_dist_Age.chip$Nominal_P < 1e-6),]) #1 hits.
## [1] 1 2
hits <- EPIC_Annotation_Complete[which(EPIC_Annotation_Complete$Name%in%hits_CpGs$CpG),]
hits$UCSC_RefGene_Name
## [1] SLC6A6;SLC6A6
## 66070 Levels:  A1BG A1BG-AS1;A1BG A1BG-AS1;A1BG;ZNF497;ZNF497 ... ZZZ3;ZZZ3;ZZZ3
#Delta beta.
#Using Maggie's code for deltabeta:
deltabeta <- function(df, mainvar, covar1 = NULL, covar2 = NULL, covar3 = NULL, covar4 = NULL, covar5 = NULL) {
  # Calculating delta beta of the main variable of interest (mainvar), with up to 5 possible covariates (covar)
  # mainvar should be a vector of continuous variable
  # all covars should also be vectors
  # df = dataframe or matrix of beta values
  # output is a vector of delta beta values
  sd=sd(mainvar)
  qt <-
    range <- max(mainvar, na.rm = T) - min(mainvar, na.rm = T)
  dB <- vector(mode = "numeric", length = nrow(df))
  names(dB) <- rownames(df)
  for (i in 1:nrow(df)) {
    beta <- df[i, ]
    if (is.null(covar1)) {
      mod <- lm(beta ~ mainvar)
    } else if (is.null(covar2)) {
      mod <- lm(beta ~ mainvar + covar1)
    } else if (is.null(covar3)) {
      mod <- lm(beta ~ mainvar + covar1 + covar2)
    } else if (is.null(covar4)) {
      mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3)
    } else if (is.null(covar5)) {
      mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4)
    } else {
      mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4 + covar5)
    }
    slope <- mod$coefficients[2]
    dB[i] <- as.numeric(slope*range)
  }
  dB
}

betas.funnorm.filt <- m2beta(M_values.funnorm.filt)

delta_beta_Age.chip_fixed <- deltabeta(as.matrix(betas.funnorm.filt), AMD_pData$Age, covar1 = AMD_pData$Chip)
length(delta_beta_Age.chip_fixed)
summary(delta_beta_Age.chip_fixed)

save(delta_beta_Age.chip_fixed, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/DB_Age.chip_fixed.RData")
##3. Linear Model: Volcano Plot
#Volcano to examine hits (for DB, see below chunks):
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/DB_Age.chip_fixed.RData")

#Call Volcano (Nominal p Version, modified from Rachel's code):
source("/home/BCRICWH.LAN/dlin/KoborLab/kobor_space/cake/home/dlin/Volcano_DL_Nominal.R")

#After running the last 2 chunks, make a summary table with CpG, Nominal_P, FDR, and Delta Beta.
Age.chip_Table <- data.frame(rownames(M_values.funnorm.filt), Age.chip_LM_pval, Multi_test_corr_relaxed, delta_beta_Age.chip_fixed)
colnames(Age.chip_Table) = c("CpG", "Nominal_P", "FDR", "Delta_Beta")
identical(as.character(rownames(Age.chip_Table)), as.character(Age.chip_Table$CpG)) #TRUE.
## [1] TRUE
#Looking at top hits quickly without considering DB:
head(Age.chip_Table[order(Age.chip_Table$Nominal_P),],10)
##                   CpG    Nominal_P       FDR  Delta_Beta
## cg15155209 cg15155209 5.551609e-07 0.1537017 -0.19917332
## cg25359907 cg25359907 1.027507e-06 0.1537017  0.04889796
## cg06968164 cg06968164 1.297871e-06 0.1537017 -0.16568531
## cg19005438 cg19005438 1.445053e-06 0.1537017  0.15735573
## cg06173889 cg06173889 2.267861e-06 0.1619278  0.23625113
## cg22197050 cg22197050 2.610535e-06 0.1619278  0.06999899
## cg09981830 cg09981830 2.882339e-06 0.1619278 -0.15967392
## cg21279207 cg21279207 3.364030e-06 0.1619278  0.04648695
## cg27239981 cg27239981 3.425384e-06 0.1619278  0.17259425
## cg15861585 cg15861585 4.499119e-06 0.1759085  0.22678014
##Setting a threshold of 0.05DB, 5e-6 Nominal P (scale to 0.60DB):
makeVolcano_nominal(Age.chip_Table$Nominal_P, Age.chip_Table$Delta_Beta, 0.05, 5e-6, "DNAm changes", 0.5) #at 5e-6: 9 Hypermethylated, 0 Hypomethylated
## [1] "Hypermethylated: 5"
## [1] "Hypomethylated: 4"
## Warning: Removed 2 rows containing missing values (geom_point).

#What are these hits?
#First make an annotated table - load 450K manifest.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
head(EPIC_Annotation_Complete)
##                  Name AddressA_ID
## cg07881041 cg07881041  0085713262
## cg18478105 cg18478105  0046761277
## cg23229610 cg23229610  0021717843
## cg03513874 cg03513874  0029622133
## cg09835024 cg09835024  0016745152
## cg05451842 cg05451842  0016681196
##                                              AlleleA_ProbeSeq AddressB_ID
## cg07881041 CTACAAATACAACACCCTCAACCCATATTTCATATATTATCTCATTTAAC            
## cg18478105 AAATAAATTTCACTCTCAAATCCCAATCTCATACAACAAAACAAAAACCA  0086644198
## cg23229610 ATAAAATTCTTTCCTTAAAAAACATTAAAACCAAAATAAACAAAAATTCC            
## cg03513874 ACAATAAAATAATAAAATCCCATCACTACTTACCCTCCTTAAATAATATC            
## cg09835024 AATAAACACCAACCCCAAACCAATCTCACTTTATTAAATTACAAAAATCA  0081631976
## cg05451842 CRTTCAAATACACTATAACCCRACTAAAAAAACCCCCAACAACCCAAAAC            
##                                              AlleleB_ProbeSeq
## cg07881041                                                   
## cg18478105 AAATAAATTTCGCTCTCAAATCCCAATCTCGTACGACGAAACGAAAACCG
## cg23229610                                                   
## cg03513874                                                   
## cg09835024 AATAAACGCCGACCCCGAACCGATCTCGCTTTATTAAATTACAAAAATCG
## cg05451842                                                   
##            Infinium_Design_Type Next_Base Color_Channel
## cg07881041                   II                        
## cg18478105                    I         C           Grn
## cg23229610                   II                        
## cg03513874                   II                        
## cg09835024                    I         A           Red
## cg05451842                   II                        
##                                                                                                                        Forward_Sequence
## cg07881041 CTGCACGCCTACTGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAA[CG]CATGTATCATCTCATTTAATGCATGCATTATCTCATTTAATTCTCACAACCCCTCAGGTG
## cg18478105 TCCCGTCTTACGGGATGGATTTCGCTCTCAGGTCCCAGTCTCGTGCGGCGGGGCGGGGAC[CG]CAGCCGGCTGGGCGGGGAAGCCCTGAGCCGGGGAAGTCACGTGGGGCGTGTCCGGAGGCG
## cg23229610 GTTTCTGGACAGTAAAATTCTTTCCTTGAAGGACATTAGGGCCAAAATGGGCAAGGATTC[CG]AGATTGGTACATCGAGCGTTATCTTCCAACTCTCTTTTCTAAATGGGCTCATTTAGTAAT
## cg03513874 ATTGTGCCCACCTTGCTGCTGACAGTTAAGCATCACTAAAGTAGGAAATAGGGTCCAAAC[CG]ACACTACTTAAGGAGGGCAAGTAGTGATGGGACCTCATCATCCCATTGCTATCATGGAGC
## cg09835024 AGCCCCGTCATAGGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAAT[CG]GTATTCAAAAAAAAAAAAAAAAAAGGGCGGGGAGGACACTCCCTCTTCTCTGTTCCCACA
## cg05451842 CACAGCGTGGATGCCCCGATTTCCCAGGTCCCTCCGCAACCCTCAGTAGAACTCCCACCG[CG]CCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAACGCCCCGGTCCGT
##            Genome_Build CHR   MAPINFO
## cg07881041           37  19   5236016
## cg18478105           37  20  61847650
## cg23229610           37   1   6841125
## cg03513874           37   2 198303466
## cg09835024           37   X  24072640
## cg05451842           37  14  93581139
##                                                     SourceSeq Strand
## cg07881041 TGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAACG      R
## cg18478105 CGGTCCCCGCCCCGCCGCACGAGACTGGGACCTGAGAGCGAAATCCATCC      R
## cg23229610 CGGAATCCTTGCCCATTTTGGCCCTAATGTCCTTCAAGGAAAGAATTTTA      R
## cg03513874 CAATGGGATGATGAGGTCCCATCACTACTTGCCCTCCTTAAGTAGTGTCG      F
## cg09835024 GGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAATCG      R
## cg05451842 CGCCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAAC      F
##                  UCSC_RefGene_Name                  UCSC_RefGene_Accession
## cg07881041 PTPRS;PTPRS;PTPRS;PTPRS NM_130855;NM_002850;NM_130854;NM_130853
## cg18478105                  YTHDF1                               NM_017798
## cg23229610                                                                
## cg03513874                                                                
## cg09835024                  EIF2S3                               NM_001415
## cg05451842       ITPK1;ITPK1;ITPK1     NM_001142593;NM_014216;NM_001142594
##             UCSC_RefGene_Group    UCSC_CpG_Islands_Name
## cg07881041 Body;Body;Body;Body    chr19:5237294-5237669
## cg18478105              TSS200  chr20:61846843-61848103
## cg23229610                         chr1:6844313-6846366
## cg03513874                     chr2:198299244-198299972
## cg09835024             TSS1500   chrX:24072558-24073135
## cg05451842      Body;Body;Body  chr14:93581083-93582797
##            Relation_to_UCSC_CpG_Island Phantom4_Enhancers Phantom5_Enhancers
## cg07881041                     N_Shore                                      
## cg18478105                      Island                                      
## cg23229610                     N_Shelf                                      
## cg03513874                     S_Shelf                                      
## cg09835024                      Island                                      
## cg05451842                      Island                                      
##            DMR X450k_Enhancer           HMM_Island Regulatory_Feature_Name
## cg07881041                 NA                                             
## cg18478105                 NA 20:61317142-61318498    20:61846284-61847956
## cg23229610                 NA                                             
## cg03513874                 NA                                             
## cg09835024                 NA                          X:24071907-24073667
## cg05451842                 NA 14:92650663-92652544                        
##            Regulatory_Feature_Group GencodeBasicV12_NAME
## cg07881041                                              
## cg18478105      Promoter_Associated        YTHDF1;YTHDF1
## cg23229610                                              
## cg03513874                                              
## cg09835024      Promoter_Associated               EIF2S3
## cg05451842                                         ITPK1
##                      GencodeBasicV12_Accession GencodeBasicV12_Group
## cg07881041                                                          
## cg18478105 ENST00000370334.4;ENST00000370339.3         TSS200;TSS200
## cg23229610                                                          
## cg03513874                                                          
## cg09835024                   ENST00000253039.4                TSS200
## cg05451842                   ENST00000555495.1                 5'UTR
##             GencodeCompV12_NAME
## cg07881041                     
## cg18478105        YTHDF1;YTHDF1
## cg23229610                     
## cg03513874                     
## cg09835024 EIF2S3;EIF2S3;EIF2S3
## cg05451842                ITPK1
##                                         GencodeCompV12_Accession
## cg07881041                                                      
## cg18478105                   ENST00000370334.4;ENST00000370339.3
## cg23229610                                                      
## cg03513874                                                      
## cg09835024 ENST00000487075.1;ENST00000423068.1;ENST00000253039.4
## cg05451842                                     ENST00000555495.1
##              GencodeCompV12_Group DNase_Hypersensitivity_NAME
## cg07881041                                                   
## cg18478105          TSS200;TSS200     chr20:61847520-61847755
## cg23229610                                                   
## cg03513874                                                   
## cg09835024 TSS1500;TSS1500;TSS200      chrX:24072600-24073395
## cg05451842                  5'UTR     chr14:93581080-93581375
##            DNase_Hypersensitivity_Evidence_Count OpenChromatin_NAME
## cg07881041                                    NA                   
## cg18478105                                     3                   
## cg23229610                                    NA                   
## cg03513874                                    NA                   
## cg09835024                                     3                   
## cg05451842                                     3                   
##            OpenChromatin_Evidence_Count TFBS_NAME TFBS_Evidence_Count
## cg07881041                           NA                            NA
## cg18478105                           NA                            NA
## cg23229610                           NA                            NA
## cg03513874                           NA                            NA
## cg09835024                           NA                            NA
## cg05451842                           NA                            NA
##            Methyl27_Loci Methyl450_Loci Chromosome_36 Coordinate_36
## cg07881041            NA           TRUE            19       5187016
## cg18478105            NA           TRUE            20      61318095
## cg23229610            NA           TRUE             1       6763712
## cg03513874            NA           TRUE             2     198011711
## cg09835024            NA           TRUE             X      23982561
## cg05451842            NA           TRUE            14      92650892
##                             SNP_ID SNP_DISTANCE SNP_MinorAlleleFrequency
## cg07881041             rs187313142           18                 0.000200
## cg18478105             rs549944121            5                 0.001797
## cg23229610 rs545824288;rs527255711        40;12        0.000200;0.001198
## cg03513874                                                              
## cg09835024                                                              
## cg05451842             rs550745821           22                 0.000200
##            Random_Loci  X strand Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs
## cg07881041          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg18478105          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg23229610          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg03513874          NA NA      +     <NA>        NA   <NA>      NA   <NA>
## cg09835024          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg05451842          NA NA      +     <NA>        NA   <NA>      NA   <NA>
##            SBE_maf CH_450_XY CH_450_Aut CH_EPIC Cross_Hyb
## cg07881041      NA        No         No      No        No
## cg18478105      NA        No         No      No        No
## cg23229610      NA        No         No      No        No
## cg03513874      NA        No         No      No        No
## cg09835024      NA        No         No      No        No
## cg05451842      NA        No         No      No        No
Age.chip_Table.annotated = merge(Age.chip_Table, EPIC_Annotation_Complete[,c("Name", "CHR", "Strand", "UCSC_RefGene_Name", "UCSC_RefGene_Group")], by.x = "CpG", by.y = "Name", all = FALSE)
colnames(Age.chip_Table.annotated)[5:6] = c("Chromosome", "Coordinate")
Age.chip_Table.annotated <- Age.chip_Table.annotated[order(Age.chip_Table.annotated$Nominal_P),]

#Grabbing the Volcano hits:
LM_Age.chip_Hits <- Age.chip_Table.annotated[which(abs(Age.chip_Table.annotated$Delta_Beta)>0.05 & Age.chip_Table.annotated$Nominal_P<5e-6),]

#Let's order by Nominal_P:
LM_Age.chip_Hits = LM_Age.chip_Hits[order(LM_Age.chip_Hits$Nominal_P),]
rownames(LM_Age.chip_Hits) = c()
str(LM_Age.chip_Hits)
## 'data.frame':    9 obs. of  8 variables:
##  $ CpG               : Factor w/ 425456 levels "cg00000029","cg00000108",..: 246988 119392 301975 106386 345056 167079 416764 256738 137398
##  $ Nominal_P         : num  5.55e-07 1.30e-06 1.45e-06 2.27e-06 2.61e-06 ...
##  $ FDR               : num  0.154 0.154 0.154 0.162 0.162 ...
##  $ Delta_Beta        : num  -0.199 -0.166 0.157 0.236 0.07 ...
##  $ Chromosome        : Factor w/ 25 levels "","1","10","11",..: 17 6 8 13 13 21 20 4 7
##  $ Coordinate        : Factor w/ 3 levels "","F","R": 2 2 2 3 3 3 2 2 2
##  $ UCSC_RefGene_Name : Factor w/ 66070 levels "","A1BG","A1BG-AS1;A1BG",..: 53494 1 20278 54913 30522 1 1 15936 21983
##  $ UCSC_RefGene_Group: Factor w/ 8044 levels "","1stExon","1stExon;1stExon",..: 4133 1 5441 1209 5441 1 1 5441 5441
head(LM_Age.chip_Hits)
##          CpG    Nominal_P       FDR  Delta_Beta Chromosome Coordinate
## 1 cg15155209 5.551609e-07 0.1537017 -0.19917332          3          F
## 2 cg06968164 1.297871e-06 0.1537017 -0.16568531         13          F
## 3 cg19005438 1.445053e-06 0.1537017  0.15735573         15          F
## 4 cg06173889 2.267861e-06 0.1619278  0.23625113          2          R
## 5 cg22197050 2.610535e-06 0.1619278  0.06999899          2          R
## 6 cg09981830 2.882339e-06 0.1619278 -0.15967392          7          R
##   UCSC_RefGene_Name UCSC_RefGene_Group
## 1     SLC6A6;SLC6A6          Body;Body
## 2                                     
## 3             FOXB1            TSS1500
## 4       SOX11;SOX11      3'UTR;1stExon
## 5      LOC100132215            TSS1500
## 6

Age-associated hits: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3482848/ Non-tissue specific hits: https://epigeneticsandchromatin.biomedcentral.com/articles/10.1186/s13072-018-0191-3

#Reorder row index.
rownames(Age.chip_Table.annotated) <- NULL

#Look at hits based on candidate genes.
head(Age.chip_Table.annotated[which(Age.chip_Table.annotated$UCSC_RefGene_Name == "ELOVL2"),]) #First ELOVL2 hit == row 278.
##               CpG    Nominal_P       FDR  Delta_Beta Chromosome Coordinate
## 490    cg16867657 0.0006480605 0.5222414  0.10186999          6          F
## 5912   cg24724428 0.0102957390 0.6871049  0.09904397          6          F
## 15572  cg21572722 0.0302386080 0.7669489  0.05297685          6          F
## 24589  cg01799681 0.0504981006 0.8099904 -0.06682364          6          F
## 67578  cg16323298 0.1540096275 0.9001507 -0.02915569          6          F
## 118811 cg25151806 0.2820134342 0.9380256  0.01249799          6          R
##        UCSC_RefGene_Name UCSC_RefGene_Group
## 490               ELOVL2            TSS1500
## 5912              ELOVL2            TSS1500
## 15572             ELOVL2            TSS1500
## 24589             ELOVL2               Body
## 67578             ELOVL2            TSS1500
## 118811            ELOVL2            TSS1500
head(Age.chip_Table.annotated[which(Age.chip_Table.annotated$UCSC_RefGene_Name == "EDARADD"),]) #First EDARADD hit == row 111292.
##              CpG Nominal_P       FDR Delta_Beta Chromosome Coordinate
## 48400 cg18964582 0.1069164 0.8722603 0.01701566          1          R
##       UCSC_RefGene_Name UCSC_RefGene_Group
## 48400           EDARADD            TSS1500
head(Age.chip_Table.annotated[which(Age.chip_Table.annotated$UCSC_RefGene_Name == "TOM1L1"),]) #First TOM1L1 hit == row 2140.
##               CpG  Nominal_P       FDR  Delta_Beta Chromosome Coordinate
## 2814   cg07081054 0.00460419 0.6425326 0.063107922         17          R
## 7730   cg25431220 0.01384333 0.7057645 0.017801409         17          F
## 33207  cg05265484 0.07067246 0.8391064 0.070202235         17          F
## 54646  cg10237252 0.12227540 0.8833635 0.101841269         17          R
## 170663 cg03870845 0.41371178 0.9583453 0.008528875         17          F
## 173287 cg13913085 0.42041479 0.9591561 0.044899176         17          F
##        UCSC_RefGene_Name UCSC_RefGene_Group
## 2814              TOM1L1               Body
## 7730              TOM1L1             TSS200
## 33207             TOM1L1            TSS1500
## 54646             TOM1L1            TSS1500
## 170663            TOM1L1             TSS200
## 173287            TOM1L1               Body
head(Age.chip_Table.annotated[which(Age.chip_Table.annotated$UCSC_RefGene_Name == "NPTX2"),]) #First NPTX2 hit == row 8828.
##              CpG  Nominal_P       FDR  Delta_Beta Chromosome Coordinate
## 13049 cg02368096 0.02478601 0.7509686  0.06380722          7          R
## 21865 cg13585675 0.04447912 0.8019882  0.05113651          7          F
## 36700 cg13878520 0.07904831 0.8494727  0.03212259          7          R
## 42812 cg05168977 0.09365409 0.8635618 -0.04768416          7          F
## 48517 cg13695954 0.10719414 0.8723142  0.01446438          7          F
## 52482 cg13314145 0.11695986 0.8797918  0.02459323          7          R
##       UCSC_RefGene_Name UCSC_RefGene_Group
## 13049             NPTX2               Body
## 21865             NPTX2               Body
## 36700             NPTX2               Body
## 42812             NPTX2            1stExon
## 48517             NPTX2               Body
## 52482             NPTX2            TSS1500

EWAS Model 7: Candidate approach to look at ELOVL2.

#Load all the objects.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_SVA_M_values.funnorm.RData") 

dim(AMD_pData) #44 samples.
## [1] 44 22
dim(M_values.funnorm.filt) #425456 probes.
## [1] 425456     44
AMD_pData$Disease_State <- as.factor(AMD_pData$Disease_State)
AMD_pData$Sex <- as.factor(AMD_pData$Sex)
AMD_pData$Row <- as.factor(AMD_pData$Row)
AMD_pData$Chip <- as.factor(AMD_pData$Chip)

str(AMD_pData)
## 'data.frame':    44 obs. of  22 variables:
##  $ Sample_Name  : chr  "Sample 1" "Sample 10" "Sample 11" "Sample 12" ...
##  $ Disease_State: Factor w/ 2 levels "age-related macular degeneration",..: 2 2 2 1 1 1 2 1 1 1 ...
##  $ Sex          : Factor w/ 2 levels "F","M": 2 2 2 2 2 1 2 2 1 2 ...
##  $ Age          : int  61 74 70 76 79 89 66 70 83 76 ...
##  $ Tissue       : chr  "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" ...
##  $ Row          : Factor w/ 12 levels "R01C01","R01C02",..: 3 10 12 1 3 5 7 9 11 2 ...
##  $ Chip         : Factor w/ 4 levels "200723300084",..: 2 2 2 4 4 4 4 4 4 4 ...
##  $ Basename     : chr  "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
##  $ filenames    : chr  "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
##  $ xMed         : num  11.4 11.7 11.9 11.7 11.6 ...
##  $ yMed         : num  11.7 12 12.1 11.9 11.9 ...
##  $ predictedSex : chr  "M" "M" "M" "M" ...
##  $ SV           : num  -0.1222 -0.011 -0.1074 -0.073 -0.0265 ...
##  $ Epithelial   : num  0.387 0.436 0.367 0.413 0.387 ...
##  $ Fibroblast   : num  0.326 0.316 0.298 0.24 0.312 ...
##  $ B_Cell       : num  0.0502 0.0454 0.0582 0.0528 0.0482 ...
##  $ NK_Cell      : num  0.065 0.0601 0.0816 0.0986 0.0743 ...
##  $ CD4T         : num  0.0478 0.0467 0.0545 0.0658 0.0544 ...
##  $ CD8T         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Monocyte     : num  0.0685 0.05 0.081 0.0666 0.0651 ...
##  $ Neutrophil   : num  0 0 0 0 0 ...
##  $ Eosinophil   : num  0.0555 0.0457 0.0597 0.0632 0.0597 ...
#Check order.
identical(rownames(AMD_pData), colnames(M_values.funnorm.filt)) #TRUE. 
## [1] TRUE
#Sanity check - there should be no NAs or infinite numbers - which could be a result of logit transformation of 0 or 1 beta values. 
all(complete.cases(M_values.funnorm.filt)) == "TRUE" #TRUE - meaning no NA or infinite numbers. 
## [1] TRUE
#Load 450K manifest.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
head(EPIC_Annotation_Complete)
##                  Name AddressA_ID
## cg07881041 cg07881041  0085713262
## cg18478105 cg18478105  0046761277
## cg23229610 cg23229610  0021717843
## cg03513874 cg03513874  0029622133
## cg09835024 cg09835024  0016745152
## cg05451842 cg05451842  0016681196
##                                              AlleleA_ProbeSeq AddressB_ID
## cg07881041 CTACAAATACAACACCCTCAACCCATATTTCATATATTATCTCATTTAAC            
## cg18478105 AAATAAATTTCACTCTCAAATCCCAATCTCATACAACAAAACAAAAACCA  0086644198
## cg23229610 ATAAAATTCTTTCCTTAAAAAACATTAAAACCAAAATAAACAAAAATTCC            
## cg03513874 ACAATAAAATAATAAAATCCCATCACTACTTACCCTCCTTAAATAATATC            
## cg09835024 AATAAACACCAACCCCAAACCAATCTCACTTTATTAAATTACAAAAATCA  0081631976
## cg05451842 CRTTCAAATACACTATAACCCRACTAAAAAAACCCCCAACAACCCAAAAC            
##                                              AlleleB_ProbeSeq
## cg07881041                                                   
## cg18478105 AAATAAATTTCGCTCTCAAATCCCAATCTCGTACGACGAAACGAAAACCG
## cg23229610                                                   
## cg03513874                                                   
## cg09835024 AATAAACGCCGACCCCGAACCGATCTCGCTTTATTAAATTACAAAAATCG
## cg05451842                                                   
##            Infinium_Design_Type Next_Base Color_Channel
## cg07881041                   II                        
## cg18478105                    I         C           Grn
## cg23229610                   II                        
## cg03513874                   II                        
## cg09835024                    I         A           Red
## cg05451842                   II                        
##                                                                                                                        Forward_Sequence
## cg07881041 CTGCACGCCTACTGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAA[CG]CATGTATCATCTCATTTAATGCATGCATTATCTCATTTAATTCTCACAACCCCTCAGGTG
## cg18478105 TCCCGTCTTACGGGATGGATTTCGCTCTCAGGTCCCAGTCTCGTGCGGCGGGGCGGGGAC[CG]CAGCCGGCTGGGCGGGGAAGCCCTGAGCCGGGGAAGTCACGTGGGGCGTGTCCGGAGGCG
## cg23229610 GTTTCTGGACAGTAAAATTCTTTCCTTGAAGGACATTAGGGCCAAAATGGGCAAGGATTC[CG]AGATTGGTACATCGAGCGTTATCTTCCAACTCTCTTTTCTAAATGGGCTCATTTAGTAAT
## cg03513874 ATTGTGCCCACCTTGCTGCTGACAGTTAAGCATCACTAAAGTAGGAAATAGGGTCCAAAC[CG]ACACTACTTAAGGAGGGCAAGTAGTGATGGGACCTCATCATCCCATTGCTATCATGGAGC
## cg09835024 AGCCCCGTCATAGGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAAT[CG]GTATTCAAAAAAAAAAAAAAAAAAGGGCGGGGAGGACACTCCCTCTTCTCTGTTCCCACA
## cg05451842 CACAGCGTGGATGCCCCGATTTCCCAGGTCCCTCCGCAACCCTCAGTAGAACTCCCACCG[CG]CCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAACGCCCCGGTCCGT
##            Genome_Build CHR   MAPINFO
## cg07881041           37  19   5236016
## cg18478105           37  20  61847650
## cg23229610           37   1   6841125
## cg03513874           37   2 198303466
## cg09835024           37   X  24072640
## cg05451842           37  14  93581139
##                                                     SourceSeq Strand
## cg07881041 TGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAACG      R
## cg18478105 CGGTCCCCGCCCCGCCGCACGAGACTGGGACCTGAGAGCGAAATCCATCC      R
## cg23229610 CGGAATCCTTGCCCATTTTGGCCCTAATGTCCTTCAAGGAAAGAATTTTA      R
## cg03513874 CAATGGGATGATGAGGTCCCATCACTACTTGCCCTCCTTAAGTAGTGTCG      F
## cg09835024 GGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAATCG      R
## cg05451842 CGCCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAAC      F
##                  UCSC_RefGene_Name                  UCSC_RefGene_Accession
## cg07881041 PTPRS;PTPRS;PTPRS;PTPRS NM_130855;NM_002850;NM_130854;NM_130853
## cg18478105                  YTHDF1                               NM_017798
## cg23229610                                                                
## cg03513874                                                                
## cg09835024                  EIF2S3                               NM_001415
## cg05451842       ITPK1;ITPK1;ITPK1     NM_001142593;NM_014216;NM_001142594
##             UCSC_RefGene_Group    UCSC_CpG_Islands_Name
## cg07881041 Body;Body;Body;Body    chr19:5237294-5237669
## cg18478105              TSS200  chr20:61846843-61848103
## cg23229610                         chr1:6844313-6846366
## cg03513874                     chr2:198299244-198299972
## cg09835024             TSS1500   chrX:24072558-24073135
## cg05451842      Body;Body;Body  chr14:93581083-93582797
##            Relation_to_UCSC_CpG_Island Phantom4_Enhancers Phantom5_Enhancers
## cg07881041                     N_Shore                                      
## cg18478105                      Island                                      
## cg23229610                     N_Shelf                                      
## cg03513874                     S_Shelf                                      
## cg09835024                      Island                                      
## cg05451842                      Island                                      
##            DMR X450k_Enhancer           HMM_Island Regulatory_Feature_Name
## cg07881041                 NA                                             
## cg18478105                 NA 20:61317142-61318498    20:61846284-61847956
## cg23229610                 NA                                             
## cg03513874                 NA                                             
## cg09835024                 NA                          X:24071907-24073667
## cg05451842                 NA 14:92650663-92652544                        
##            Regulatory_Feature_Group GencodeBasicV12_NAME
## cg07881041                                              
## cg18478105      Promoter_Associated        YTHDF1;YTHDF1
## cg23229610                                              
## cg03513874                                              
## cg09835024      Promoter_Associated               EIF2S3
## cg05451842                                         ITPK1
##                      GencodeBasicV12_Accession GencodeBasicV12_Group
## cg07881041                                                          
## cg18478105 ENST00000370334.4;ENST00000370339.3         TSS200;TSS200
## cg23229610                                                          
## cg03513874                                                          
## cg09835024                   ENST00000253039.4                TSS200
## cg05451842                   ENST00000555495.1                 5'UTR
##             GencodeCompV12_NAME
## cg07881041                     
## cg18478105        YTHDF1;YTHDF1
## cg23229610                     
## cg03513874                     
## cg09835024 EIF2S3;EIF2S3;EIF2S3
## cg05451842                ITPK1
##                                         GencodeCompV12_Accession
## cg07881041                                                      
## cg18478105                   ENST00000370334.4;ENST00000370339.3
## cg23229610                                                      
## cg03513874                                                      
## cg09835024 ENST00000487075.1;ENST00000423068.1;ENST00000253039.4
## cg05451842                                     ENST00000555495.1
##              GencodeCompV12_Group DNase_Hypersensitivity_NAME
## cg07881041                                                   
## cg18478105          TSS200;TSS200     chr20:61847520-61847755
## cg23229610                                                   
## cg03513874                                                   
## cg09835024 TSS1500;TSS1500;TSS200      chrX:24072600-24073395
## cg05451842                  5'UTR     chr14:93581080-93581375
##            DNase_Hypersensitivity_Evidence_Count OpenChromatin_NAME
## cg07881041                                    NA                   
## cg18478105                                     3                   
## cg23229610                                    NA                   
## cg03513874                                    NA                   
## cg09835024                                     3                   
## cg05451842                                     3                   
##            OpenChromatin_Evidence_Count TFBS_NAME TFBS_Evidence_Count
## cg07881041                           NA                            NA
## cg18478105                           NA                            NA
## cg23229610                           NA                            NA
## cg03513874                           NA                            NA
## cg09835024                           NA                            NA
## cg05451842                           NA                            NA
##            Methyl27_Loci Methyl450_Loci Chromosome_36 Coordinate_36
## cg07881041            NA           TRUE            19       5187016
## cg18478105            NA           TRUE            20      61318095
## cg23229610            NA           TRUE             1       6763712
## cg03513874            NA           TRUE             2     198011711
## cg09835024            NA           TRUE             X      23982561
## cg05451842            NA           TRUE            14      92650892
##                             SNP_ID SNP_DISTANCE SNP_MinorAlleleFrequency
## cg07881041             rs187313142           18                 0.000200
## cg18478105             rs549944121            5                 0.001797
## cg23229610 rs545824288;rs527255711        40;12        0.000200;0.001198
## cg03513874                                                              
## cg09835024                                                              
## cg05451842             rs550745821           22                 0.000200
##            Random_Loci  X strand Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs
## cg07881041          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg18478105          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg23229610          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg03513874          NA NA      +     <NA>        NA   <NA>      NA   <NA>
## cg09835024          NA NA      -     <NA>        NA   <NA>      NA   <NA>
## cg05451842          NA NA      +     <NA>        NA   <NA>      NA   <NA>
##            SBE_maf CH_450_XY CH_450_Aut CH_EPIC Cross_Hyb
## cg07881041      NA        No         No      No        No
## cg18478105      NA        No         No      No        No
## cg23229610      NA        No         No      No        No
## cg03513874      NA        No         No      No        No
## cg09835024      NA        No         No      No        No
## cg05451842      NA        No         No      No        No
#Pull out ELOVL2-specific CpG sites.
ELOVL2 <- EPIC_Annotation_Complete[which(EPIC_Annotation_Complete$UCSC_RefGene_Name == "ELOVL2"),]

#Subset ELOVL2-specific CpG sites.
M_values.funnorm.filt <- M_values.funnorm.filt[which(rownames(M_values.funnorm.filt) %in% rownames(ELOVL2)),]
library(pbapply) #Progress bar for apply functions.
## Warning: package 'pbapply' was built under R version 3.6.3
#EWAS on Age - All samples.
#LM: Need to use transformed M-values instead of beta values as it is more statistically sound.
Age.candidate_LM_pval <- pbsapply(1:nrow(M_values.funnorm.filt), function(CpG){
  meta <- AMD_pData
  meta$Mval <- M_values.funnorm.filt[CpG,]
  mod_Age.candidate.row <- lm(Mval ~ Age + SV, data = meta) 
  coef(summary(mod_Age.candidate.row))[2,4]}) #Returns nominal p-value for Age for model at each CpG.
head(Age.candidate_LM_pval)
## [1] 8.842870e-01 8.694883e-01 2.590172e-02 4.282323e-01 1.398169e-01
## [6] 3.463246e-05
#Inspect p-value distribution for model.
pvalue_dist_Age.candidate <- data.frame(CpG = rownames(M_values.funnorm.filt), Nominal_P = Age.candidate_LM_pval)
qqnorm(pvalue_dist_Age.candidate$Nominal_P)

#Right skewed Q-Q plot.

Epigenetic clocks

Step 1: Prepare data to run Horvath, Hannum and PhenoAge clocks.

This is the site with information pertaining to the Hovarth clock input. Note that the site runs all three clocks: Horvath, Hannum and PhenoAge clocks (among other clocks - refer to Horvath’s manual (Location: ~KoborLab/kobor_space/kendrix/macular_degeneration/ManualEpigeneticClock3.pdf)) simultaneously with the same input.

Input: Data frame of betas from noob-normalised MSet object with Sample IDs as columns.

#Horvath data prep function. 
horvathPrep <- function(dat){
  dat <- as.data.frame(dat)
  dat0= cbind(rownames(dat), dat) #need col 1 to be CpG probe ID
  datMiniAnnotation=read.csv("~/KoborLab/kobor_space/shared_coding_resource/Horvath_Age_Prediction/datMiniAnnotation3.csv") #get from horvath website above
  match1=match(datMiniAnnotation[,1], dat0[,1] )
  dat0Reduced=dat0[match1,]
  dat0Reduced[,1]=as.character(dat0Reduced[,1])
  dat0Reduced[is.na(match1),1]= as.character(datMiniAnnotation[is.na(match1),1])
  datout=data.frame(dat0Reduced)
  # make sure you output numeric variables...
  for (i in 2:dim(datout)[[2]]  ){datout[,i]= as.numeric(as.character(gsub(x=datout[,i],pattern="\"",replacement=""))) }
  colnames(datout)[1] <- "Probe"                         
  return(datout)}

#Prepare methylation data for Horvath clock. 
betas.noob <- getBeta(AMD_MSet.noob)
AMD_betas.Horvath <- horvathPrep(betas.noob)

#Match colnames to AMD_samplesheet.
colnames(AMD_betas.Horvath) <- gsub("\\.", " ", colnames(AMD_betas.Horvath))
identical(colnames(AMD_betas.Horvath)[2:45], sampleNames(AMD_ExtendedRGSet)) #Sample order matches.

write.table(AMD_betas.Horvath,"~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_betas.Horvath.csv", row.names = F, sep = "," )

For ‘Advanced Blood Analysis’ (more columns in output including Hannum & Weidner clocks, as well as cell type estimate, etc. you need to create an annotation file to be used with calculating epigenetic age. We need three variables as well as the first column representing the sample names. - Column 1: Sample ID which contains a list of the same ordered names as the methylation file - Column 2: Chronological age as an integer - Column 3: Titled “Female”, male = 0 and female = 1. This is used to validate gender labels - Column 4: “Tissue”, from Horvath’s PDF tutorial.

#Make sample file for Advanced Analysis in Blood. 
Sample_Annotation <- pData(AMD_MSet.noob)
head(Sample_Annotation)

Sample_Annotation <- Sample_Annotation[, c("Sample_Name", "Age", "Sex", "Tissue")]
Sample_Annotation$Sex <- gsub("F", "1", gsub("M", "0", Sample_Annotation$Sex))
colnames(Sample_Annotation)[3] <- "Female"

str(Sample_Annotation)
identical(rownames(Sample_Annotation), colnames(AMD_betas.Horvath)[2:45]) #TRUE.
## [1] TRUE
write.table(Sample_Annotation,"~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Sample_Annotation_Horvath.csv", row.names = F, sep = "," )

Step 2: Explore output of Horvath et al. epigenetics clocks.

After the epigenetic ages are calculated and the output is obtained from the Horvath team, upload the output to the server to be visualised.

#Load necessary objeects.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_ExtendedRGSet.RData")
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_MSet.noob.RData")

AMD_samplesheet <- read.csv("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/AMD_samplesheet.csv", header = TRUE)
AMD_Epigenetic_Age <- read.csv(file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_betas.output.csv", row.names = 1)
rownames(AMD_Epigenetic_Age) <- AMD_Epigenetic_Age$Sample_Name

#Add Epigenetic Age information to RGSet. 
identical(rownames(pData(AMD_ExtendedRGSet)), rownames(AMD_Epigenetic_Age))
## [1] TRUE
AMD_pData <- as.data.frame(pData(AMD_ExtendedRGSet))
AMD_pData <- join(AMD_pData, AMD_Epigenetic_Age, by = "Sample_Name")
rownames(AMD_pData) <- AMD_pData$Sample_Name
identical(rownames(pData(AMD_ExtendedRGSet)), rownames(AMD_pData)) #Check to make sure the order of the samples that are merged is correct.
## [1] TRUE

Step 3: Horvath clock.

The Horvath clock is a pan-tissue epigenetic clock that will predict the DNAm age of tissues based on 353 CpG sites that were developed from 8,000 samples from 82 datasets.

Note that even though the Horvath clock was trained using multiple tissues, none of the tissues are retina tissues that are in this dataset.

Error: +/- 3.6 years.

#Subset relevant epigenetic age variables. 
AMD_Epigenetic_Age_Horvath <- AMD_pData[, c("Sample_Name", "Sample_Group", "Array", "Slide", "Sex", "predictedGender", "Tissue", "predictedTissue", "Age", "DNAmAge", "AgeAccelerationDiff", "AgeAccelerationResidual")] 

#Create function to plot linear regression.
ggplotRegression <- function (fit) {

require(ggplot2)

ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) + 
  geom_point(aes(colour = AMD_Epigenetic_Age_Horvath$Sex, shape = AMD_Epigenetic_Age_Horvath$Sample_Group)) +
  guides(size = FALSE) +
  stat_smooth(method = "lm", col = "red") 
}

#Regression plot of Horvath predicted DNAmAge against reported age:
AMD_HO_plot <- lm(DNAmAge ~ Age, data = AMD_Epigenetic_Age_Horvath)
ggplotRegression(AMD_HO_plot) + labs(x = "Reported age", y = "Horvath predicted age", caption = "Comparing reported age and predicted age using Horvath epigenetic clock") + 
  scale_colour_manual(name = "Sex", values = c("#990000", "#2c7dab"), labels = c("Female", "Male")) +
  scale_shape_manual(name = "Disease State", values = c(20, 23), labels = c("AMD", "Normal")) +
  stat_cor(method = "spearman") +
  theme_bw() + theme(panel.border = element_blank(), 
                     panel.grid.major = element_blank(), 
                     panel.grid.minor = element_blank(), 
                     axis.line = element_line(colour = "black"), 
                     axis.text = element_text(), 
                     axis.title = element_text(size = 8), 
                     plot.caption = element_text(hjust = 0.5, size = rel(1)))
## `geom_smooth()` using formula 'y ~ x'

#Plot of overall disease state vs control:
AMD_Epigenetic_Age_Horvath$Sample_Group <- factor(AMD_Epigenetic_Age_Horvath$Sample_Group)
ggplot(data = AMD_Epigenetic_Age_Horvath, 
       aes(x = Sample_Group, y = AgeAccelerationResidual, fill = Sample_Group)) +
  geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
  geom_jitter(aes(fill = Sample_Group), size = 2, shape = 21, alpha = 0.7, 
              position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) + 
  xlab("Disease State") + ylab("Horvath Age Acceleration Residual") + 
  scale_x_discrete(breaks = c("age-related macular degeneration","normal"), 
                   labels = c("Age-Related Macular Degeneration", "Normal")) + 
  scale_fill_manual(values = c("#77284e", "#ffffff")) +
  theme_classic() + theme(
              panel.border = element_blank(),
              panel.background = element_blank(),
              plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
              text = element_text(family = "Tahoma"),
              axis.title = element_text(face = "bold"),
              axis.text.x = element_text(colour = "black", size = 11),
              axis.text.y = element_text(colour = "black", size = 9),
              axis.line = element_line(size = 0.5, colour = "black"),
              legend.position = "none")

AMD <- subset(AMD_Epigenetic_Age_Horvath, Sample_Group == "age-related macular degeneration")
normal <- subset(AMD_Epigenetic_Age_Horvath, Sample_Group == "normal")

var.test(AMD$AgeAccelerationResidual, normal$AgeAccelerationResidual)
## 
##  F test to compare two variances
## 
## data:  AMD$AgeAccelerationResidual and normal$AgeAccelerationResidual
## F = 1.5411, num df = 24, denom df = 18, p-value = 0.35
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.6157627 3.6443110
## sample estimates:
## ratio of variances 
##           1.541067
#Plot of disease state vs control that is stratified by sex:
AMD_Epigenetic_Age_Horvath$Sex <- factor(AMD_Epigenetic_Age_Horvath$Sex)
ggplot(data = AMD_Epigenetic_Age_Horvath, 
       aes(x = Sample_Group, y = AgeAccelerationResidual, fill = Sex)) + 
  geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
  geom_jitter(aes(fill = Sex), size = 2, shape = 21, alpha = 0.7, 
              position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) + 
  xlab("Disease State") + ylab("Horvath Age Acceleration Residual") + 
  scale_x_discrete(breaks = c("age-related macular degeneration","normal"), 
                   labels = c("AMD", "Normal")) + 
  scale_fill_manual(values = c("#990000", "#2c7dab")) +
  theme_classic() + theme(
              panel.border = element_blank(),
              panel.background = element_blank(),
              plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
              text = element_text(family = "Tahoma"),
              axis.title = element_text(face = "bold"),
              axis.text.x = element_text(colour = "black", size = 11),
              axis.text.y = element_text(colour = "black", size = 9),
              axis.line = element_line(size = 0.5, colour = "black"))

#Calculate mean and difference for Bland-Altman plot.
AMD_Epigenetic_Age_Horvath$Mean <- (AMD_Epigenetic_Age_Horvath$DNAmAge + AMD_Epigenetic_Age_Horvath$Age)/2
AMD_Epigenetic_Age_Horvath$Difference <- AMD_Epigenetic_Age_Horvath$DNAmAge - AMD_Epigenetic_Age_Horvath$Age

ggplot(AMD_Epigenetic_Age_Horvath, aes(Mean, Difference)) + geom_point(shape = 0, size = 3) + ylim(-60, 25) + xlab("Mean (years)") + ylab("Difference (years)") + geom_hline(yintercept = 14.76577) + geom_hline(yintercept = -14.76577) + theme_classic()

#Determine statistical significance between disease state vs control with ANOVA.
summary(lm(DNAmAge ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Horvath))
## 
## Call:
## lm(formula = DNAmAge ~ Sample_Group + Sex + Sex * Sample_Group, 
##     data = AMD_Epigenetic_Age_Horvath)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.2281  -3.4734   0.5236   4.7642  16.6614 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              33.0360     2.3430  14.100   <2e-16 ***
## Sample_Groupnormal        0.6401     3.6514   0.175    0.862    
## SexM                      1.0628     3.0248   0.351    0.727    
## Sample_Groupnormal:SexM  -3.1902     4.6440  -0.687    0.496    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.409 on 40 degrees of freedom
## Multiple R-squared:  0.02061,    Adjusted R-squared:  -0.05285 
## F-statistic: 0.2805 on 3 and 40 DF,  p-value: 0.8391
summary(aov(DNAmAge ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Horvath))
##                  Df Sum Sq Mean Sq F value Pr(>F)
## Sample_Group      1   19.4   19.42   0.354  0.555
## Sex               1    0.9    0.88   0.016  0.900
## Sample_Group:Sex  1   25.9   25.91   0.472  0.496
## Residuals        40 2195.9   54.90
summary(lm(AgeAccelerationResidual ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Horvath))
## 
## Call:
## lm(formula = AgeAccelerationResidual ~ Sample_Group + Sex + Sex * 
##     Sample_Group, data = AMD_Epigenetic_Age_Horvath)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.9235  -3.3887  -0.6178   4.1704  13.0694 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)              -1.6176     1.9490  -0.830    0.411
## Sample_Groupnormal        0.3858     3.0374   0.127    0.900
## SexM                      3.6446     2.5162   1.448    0.155
## Sample_Groupnormal:SexM  -2.8798     3.8631  -0.745    0.460
## 
## Residual standard error: 6.163 on 40 degrees of freedom
## Multiple R-squared:  0.06235,    Adjusted R-squared:  -0.007979 
## F-statistic: 0.8865 on 3 and 40 DF,  p-value: 0.4563
summary(aov(AgeAccelerationResidual ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Horvath))
##                  Df Sum Sq Mean Sq F value Pr(>F)
## Sample_Group      1   18.7   18.75   0.494  0.486
## Sex               1   61.2   61.17   1.610  0.212
## Sample_Group:Sex  1   21.1   21.11   0.556  0.460
## Residuals        40 1519.5   37.99
#Determine the statistical significance male AMD vs. normal. 
Horvath_males <- subset(AMD_Epigenetic_Age_Horvath, Sex == "M")

#Preliminary checks before performing t-test.

#Test for normality:
#The Shapiro-Wilk test to check if the data of interest is normality distribution. 
#Null hypothesis = The data are normally distribution.
with(Horvath_males, shapiro.test(AgeAccelerationResidual[Sample_Group == "age-related macular degeneration"])) #p-value = 0.2265 which is greater than 0.05, implying the distribution of the data is not significantly different from normal distribution. In other words, assume normality.
## 
##  Shapiro-Wilk normality test
## 
## data:  AgeAccelerationResidual[Sample_Group == "age-related macular degeneration"]
## W = 0.92462, p-value = 0.2265
with(Horvath_males, shapiro.test(AgeAccelerationResidual[Sample_Group == "normal"])) #p-value = 0.182 which is greater than 0.05, implying the distribution of the data is not significantly different from normal distribution. In other words, assume normality.
## 
##  Shapiro-Wilk normality test
## 
## data:  AgeAccelerationResidual[Sample_Group == "normal"]
## W = 0.90463, p-value = 0.182
#Test for distribution of variances.
#Use F-test to test for homogeneity of variances. 
var.test(AgeAccelerationResidual ~ Sample_Group, data = Horvath_males) #p-value = 0.6034, which is greater than 0.05, implying no significant difference between the variances of both sets of data. 
## 
##  F test to compare two variances
## 
## data:  AgeAccelerationResidual by Sample_Group
## F = 1.3745, num df = 14, denom df = 11, p-value = 0.6034
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.4092129 4.2534156
## sample estimates:
## ratio of variances 
##           1.374468
#Since both sets of data show normality and equality between variances, we can do classic t-test that requires those assumptions of normality and variance. 
t.test(AgeAccelerationResidual ~ Sample_Group, data = Horvath_males, var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  AgeAccelerationResidual by Sample_Group
## t = 1.0394, df = 25, p-value = 0.3086
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.447938  7.435825
## sample estimates:
## mean in group age-related macular degeneration 
##                                      2.0269082 
##                           mean in group normal 
##                                     -0.4670351
#p-value = 0.3086, which is greater than 0.05. Therefore, the age-acceleration residual in AMD male is not statistically more signficant as compared to normal male.
library(effects)
## Loading required package: carData
## Warning: package 'carData' was built under R version 3.6.3
## Use the command
##     lattice::trellis.par.set(effectsTheme())
##   to customize lattice options for effects plots.
## See ?effectsTheme for details.
LM_Horvath_Residual <- lm(AgeAccelerationResidual ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Horvath)

#Look at the interaction effects.
Disease_Sex_Interact <- as.data.frame(effect("Sample_Group*Sex", LM_Horvath_Residual, se = TRUE))

#Interaction plot.
ggplot(Disease_Sex_Interact, aes(x = Sample_Group, y = fit, group = Sex)) + 
  geom_line(size = 2, aes(color = Sex)) + 
  geom_ribbon(aes(ymin = fit-se, ymax = fit+se, fill = Sex),alpha = .2) + 
  scale_color_manual(values = c("#990000", "#2c7dab")) +
  ylab("Horvath Age Acceleration Residual") + xlab("Disease State") + 
  theme_bw() + 
  theme(text = element_text(size = 12),
        legend.text = element_text(size = 12),
        legend.direction = "horizontal",
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = "top")

Step 4: Winsorized Horvath clock.

DNAm age acceleration residual data is winsorized (i.e. by shrinking outlying observations to the border of the main part of the data) before plotting.

#Windsorize age acceleration data.
AMD_Epigenetic_Age_Horvath <- AMD_Epigenetic_Age_Horvath %>% mutate(Winz_AgeAccelResidual = winsorize(AgeAccelerationResidual))
#Plot of overall disease state vs control:
AMD_Epigenetic_Age_Horvath$Sample_Group <- factor(AMD_Epigenetic_Age_Horvath$Sample_Group)
ggplot(data = AMD_Epigenetic_Age_Horvath, 
       aes(x = Sample_Group, y = Winz_AgeAccelResidual, fill = Sample_Group)) +
  geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
  geom_jitter(aes(fill = Sample_Group), size = 2, shape = 21, alpha = 0.7, 
              position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) + 
  xlab("Disease State") + ylab("Winsorized Horvath Age Acceleration Residual") + 
  scale_x_discrete(breaks = c("age-related macular degeneration","normal"), 
                   labels = c("Age-Related Macular Degeneration", "Normal")) + 
  scale_fill_manual(values = c("#77284e", "#ffffff")) +
  theme_classic() + theme(
              panel.border = element_blank(),
              panel.background = element_blank(),
              plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
              text = element_text(family = "Tahoma"),
              axis.title = element_text(face = "bold"),
              axis.text.x = element_text(colour = "black", size = 11),
              axis.text.y = element_text(colour = "black", size = 9),
              axis.line = element_line(size = 0.5, colour = "black"),
              legend.position = "none")

#Plot of disease state vs control that is stratified by sex:
AMD_Epigenetic_Age_Horvath$Sex <- factor(AMD_Epigenetic_Age_Horvath$Sex)
ggplot(data = AMD_Epigenetic_Age_Horvath, 
       aes(x = Sample_Group, y = Winz_AgeAccelResidual, fill = Sex)) + 
  geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
  geom_jitter(aes(fill = Sex), size = 2, shape = 21, alpha = 0.7, 
              position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) + 
  xlab("Disease State") + ylab("Winsorized Horvath Age Accel Residual") + 
  scale_x_discrete(breaks = c("age-related macular degeneration","normal"), 
                   labels = c("AMD", "Normal")) + 
  scale_fill_manual(values = c("#990000", "#2c7dab")) +
  theme_classic() + theme(
              panel.border = element_blank(),
              panel.background = element_blank(),
              plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
              text = element_text(family = "Tahoma"),
              axis.title = element_text(face = "bold"),
              axis.text.x = element_text(colour = "black", size = 11),
              axis.text.y = element_text(colour = "black", size = 9),
              axis.line = element_line(size = 0.5, colour = "black"))

#Determine statistical significance between disease state vs control that is stratified by sex with ANOVA.
#Compute the analysis of variance
summary(lm(Winz_AgeAccelResidual ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Horvath))
## 
## Call:
## lm(formula = Winz_AgeAccelResidual ~ Sample_Group + Sex + Sex * 
##     Sample_Group, data = AMD_Epigenetic_Age_Horvath)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.1470  -3.2248  -0.4513   3.9223   9.0518 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)              -1.5582     1.7825  -0.874    0.387
## Sample_Groupnormal        0.3264     2.7778   0.117    0.907
## SexM                      3.3468     2.3012   1.454    0.154
## Sample_Groupnormal:SexM  -2.2711     3.5330  -0.643    0.524
## 
## Residual standard error: 5.637 on 40 degrees of freedom
## Multiple R-squared:  0.06142,    Adjusted R-squared:  -0.008974 
## F-statistic: 0.8725 on 3 and 40 DF,  p-value: 0.4634
summary(aov(Winz_AgeAccelResidual ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Horvath))
##                  Df Sum Sq Mean Sq F value Pr(>F)
## Sample_Group      1   10.8   10.85   0.341  0.562
## Sex               1   59.2   59.19   1.863  0.180
## Sample_Group:Sex  1   13.1   13.13   0.413  0.524
## Residuals        40 1270.9   31.77

Step 5: Hannum clock.

Built from the whole blood of 656 human individuals, aged 19 to 101, this clock, developed by Hannum et al. looks at 71 CpG sites that are highly predictive of age.

#Subset relevant epigenetic age variables. 
AMD_Epigenetic_Age_Hannum <- AMD_pData[, c("Sample_Name", "Sample_Group", "Array", "Slide", "Sex", "predictedGender", "Tissue", "predictedTissue", "Age", "DNAmAgeHannum", "AgeAccelerationResidualHannum")] 

#Create function to plot linear regression.
ggplotRegression <- function (fit) {

require(ggplot2)

ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) + 
  geom_point(aes(colour = AMD_Epigenetic_Age_Hannum$Sex, shape = AMD_Epigenetic_Age_Hannum$Sample_Group)) +
  stat_smooth(method = "lm", col = "red") 
}

#Regression plot of Hannum predicted DNAmAge against reported age:
AMD_HA_plot <- lm(DNAmAgeHannum ~ Age, data = AMD_Epigenetic_Age_Hannum)
ggplotRegression(AMD_HA_plot) + labs(x = "Reported age", y = "Hannum predicted age", caption = "Comparing reported age and predicted age using Hannum epigenetic clock") + 
  scale_colour_manual(name = "Sex", values = c("#990000", "#2c7dab")) +
  scale_shape_manual(name = "Disease State", values = c(20, 23), labels = c("AMD", "Normal")) +
  stat_cor(method = "spearman") +
  theme_bw() + theme(panel.border = element_blank(), 
                     panel.grid.major = element_blank(), 
                     panel.grid.minor = element_blank(), 
                     axis.line = element_line(colour = "black"), 
                     axis.text = element_text(), 
                     axis.title = element_text(size = 8), 
                     plot.caption = element_text(hjust = 0.5, size = rel(1)))
## `geom_smooth()` using formula 'y ~ x'

#Plot of overall disease state vs control:
AMD_Epigenetic_Age_Hannum$Sample_Group <- factor(AMD_Epigenetic_Age_Hannum$Sample_Group)
ggplot(data = AMD_Epigenetic_Age_Hannum, 
       aes(x = Sample_Group, y = AgeAccelerationResidualHannum, fill = Sample_Group)) +
  geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
  geom_jitter(aes(fill = Sample_Group), size = 2, shape = 21, alpha = 0.7, 
              position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) + 
  xlab("Disease State") + ylab("Hannum Age Acceleration Residual") + 
  scale_x_discrete(breaks = c("age-related macular degeneration","normal"), 
                   labels = c("Age-Related Macular Degeneration", "Normal")) + 
  scale_fill_manual(values = c("#77284e", "#ffffff")) +
  theme_classic() + theme(
              panel.border = element_blank(),
              panel.background = element_blank(),
              plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
              text = element_text(family = "Tahoma"),
              axis.title = element_text(face = "bold"),
              axis.text.x = element_text(colour = "black", size = 11),
              axis.text.y = element_text(colour = "black", size = 9),
              axis.line = element_line(size = 0.5, colour = "black"),
              legend.position = "none")

#Plot of disease state vs control that is stratified by sex:
AMD_Epigenetic_Age_Hannum$Sex <- factor(AMD_Epigenetic_Age_Hannum$Sex)
ggplot(data = AMD_Epigenetic_Age_Hannum, 
       aes(x = Sample_Group, y = AgeAccelerationResidualHannum, fill = Sex)) + 
  geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
  geom_jitter(aes(fill = Sex), size = 1, shape = 21, alpha = 0.7, 
              position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) + 
  xlab("Disease State") + ylab("Hannum Age Acceleration Residual") + 
  scale_x_discrete(breaks = c("age-related macular degeneration","normal"), 
                   labels = c("AMD", "Normal")) + 
  scale_fill_manual(values = c("#990000", "#2c7dab")) +
  theme_classic() + theme(
              panel.border = element_blank(),
              panel.background = element_blank(),
              plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
              text = element_text(family = "Tahoma"),
              axis.title = element_text(face = "bold"),
              axis.text.x = element_text(colour = "black", size = 11),
              axis.text.y = element_text(colour = "black", size = 9),
              axis.line = element_line(size = 0.5, colour = "black"))

#Determine statistical significance between disease state vs control with ANOVA.
summary(lm(AgeAccelerationResidualHannum ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Hannum))
## 
## Call:
## lm(formula = AgeAccelerationResidualHannum ~ Sample_Group + Sex + 
##     Sex * Sample_Group, data = AMD_Epigenetic_Age_Hannum)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -12.807  -3.666   0.238   3.789  12.035 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)  
## (Intercept)              -2.9414     1.8490  -1.591   0.1195  
## Sample_Groupnormal       -0.5568     2.8815  -0.193   0.8478  
## SexM                      5.8651     2.3871   2.457   0.0184 *
## Sample_Groupnormal:SexM  -1.5297     3.6649  -0.417   0.6786  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.847 on 40 degrees of freedom
## Multiple R-squared:  0.1842, Adjusted R-squared:  0.123 
## F-statistic: 3.011 on 3 and 40 DF,  p-value: 0.04125
summary(aov(AgeAccelerationResidualHannum ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Hannum))
##                  Df Sum Sq Mean Sq F value  Pr(>F)   
## Sample_Group      1   19.3   19.32   0.565 0.45662   
## Sex               1  283.5  283.54   8.293 0.00636 **
## Sample_Group:Sex  1    6.0    5.96   0.174 0.67862   
## Residuals        40 1367.6   34.19                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step 6: Winsorized Hannum clock.

DNAm age acceleration residual data is winsorized (i.e. by shrinking outlying observations to the border of the main part of the data) before plotting.

#Windsorize age acceleration data.
AMD_Epigenetic_Age_Hannum <- AMD_Epigenetic_Age_Hannum %>% mutate(Winz_AgeAccelResidualHannum = winsorize(AgeAccelerationResidualHannum))
#Plot of overall disease state vs control:
AMD_Epigenetic_Age_Hannum$Sample_Group <- factor(AMD_Epigenetic_Age_Hannum$Sample_Group)
ggplot(data = AMD_Epigenetic_Age_Hannum, 
       aes(x = Sample_Group, y = Winz_AgeAccelResidualHannum, fill = Sample_Group)) +
  geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
  geom_jitter(aes(fill = Sample_Group), size = 2, shape = 21, alpha = 0.7, 
              position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) + 
  xlab("Disease State") + ylab("Winsorized Hannum Age Acceleration Residual") + 
  scale_x_discrete(breaks = c("age-related macular degeneration","normal"), 
                   labels = c("Age-Related Macular Degeneration", "Normal")) + 
  scale_fill_manual(values = c("#77284e", "#ffffff")) +
  theme_classic() + theme(
              panel.border = element_blank(),
              panel.background = element_blank(),
              plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
              text = element_text(family = "Tahoma"),
              axis.title = element_text(face = "bold"),
              axis.text.x = element_text(colour = "black", size = 11),
              axis.text.y = element_text(colour = "black", size = 9),
              axis.line = element_line(size = 0.5, colour = "black"),
              legend.position = "none")

#Plot of disease state vs control that is stratified by sex:
AMD_Epigenetic_Age_Hannum$Sex <- factor(AMD_Epigenetic_Age_Hannum$Sex)
ggplot(data = AMD_Epigenetic_Age_Hannum, 
       aes(x = Sample_Group, y = Winz_AgeAccelResidualHannum, fill = Sex)) + 
  geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
  geom_jitter(aes(fill = Sex), size = 1, shape = 21, alpha = 0.7, 
              position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) + 
  xlab("Disease State") + ylab("Winsorized Hannum Age Acceleration Residual") + 
  scale_x_discrete(breaks = c("age-related macular degeneration","normal"), 
                   labels = c("AMD", "Normal")) + 
  scale_fill_manual(values = c("#990000", "#2c7dab")) +
  theme_classic() + theme(
              panel.border = element_blank(),
              panel.background = element_blank(),
              plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
              text = element_text(family = "Tahoma"),
              axis.title = element_text(face = "bold"),
              axis.text.x = element_text(colour = "black", size = 11),
              axis.text.y = element_text(colour = "black", size = 9),
              axis.line = element_line(size = 0.5, colour = "black"))

#Determine statistical significance between disease state vs control that is stratified by sex with ANOVA.
#Compute the analysis of variance
summary(lm(Winz_AgeAccelResidualHannum ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Hannum))
## 
## Call:
## lm(formula = Winz_AgeAccelResidualHannum ~ Sample_Group + Sex + 
##     Sex * Sample_Group, data = AMD_Epigenetic_Age_Hannum)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.6332  -3.6662   0.3387   3.7893  11.6195 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)  
## (Intercept)              -2.5259     1.7583  -1.437   0.1586  
## Sample_Groupnormal       -0.9723     2.7401  -0.355   0.7246  
## SexM                      5.2760     2.2699   2.324   0.0253 *
## Sample_Groupnormal:SexM  -0.9407     3.4850  -0.270   0.7886  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.56 on 40 degrees of freedom
## Multiple R-squared:  0.1799, Adjusted R-squared:  0.1184 
## F-statistic: 2.925 on 3 and 40 DF,  p-value: 0.04541
summary(aov(Winz_AgeAccelResidualHannum ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Hannum))
##                  Df Sum Sq Mean Sq F value  Pr(>F)   
## Sample_Group      1   21.2   21.15   0.684 0.41305   
## Sex               1  247.9  247.86   8.017 0.00722 **
## Sample_Group:Sex  1    2.3    2.25   0.073 0.78862   
## Residuals        40 1236.6   30.92                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step 7: PhenoAge clock.

Developed from 513 CpG sites, this clock was built by Levine et al. to serve as prediction of a surrogate measure of “phenotypic age” that, in and of itself, differentiates morbidity and mortality risk among same-age individuals (optimized to predict a multi-system proxy of physiological dysregulation (phenotypic age)). The clock shares 41 CpGs in the Horvath clock and 5 CpGs in the Hannum clock (which was also present in the Horvath clock).

#Subset relevant epigenetic age variables. 
AMD_Epigenetic_PhenoAge <- AMD_pData[, c("Sample_Name", "Sample_Group", "Array", "Slide", "Sex", "predictedGender", "Tissue", "predictedTissue", "Age", "DNAmPhenoAge", "AgeAccelPheno")] 

#Create function to plot linear regression.
ggplotRegression <- function (fit) {

require(ggplot2)

ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) + 
  geom_point(aes(colour = AMD_Epigenetic_PhenoAge$Sex, shape = AMD_Epigenetic_PhenoAge$Sample_Group)) +
  stat_smooth(method = "lm", col = "red") 
}

#Regression plot of PhenoAge predicted DNAmAge against reported age:
AMD_PhenoAge_plot <- lm(DNAmPhenoAge ~ Age, data = AMD_Epigenetic_PhenoAge)
ggplotRegression(AMD_PhenoAge_plot) + labs(x = "Reported age", y = "PhenoAge predicted age", caption = "Comparing reported age and predicted age using PhenoAge epigenetic clock") + 
  scale_colour_manual(name = "Sex", values = c("#990000", "#2c7dab")) +
  scale_shape_manual(name = "Disease State", values = c(20, 23), labels = c("AMD", "Normal")) +
  stat_cor(method = "spearman") +
  theme_bw() + theme(panel.border = element_blank(), 
                     panel.grid.major = element_blank(), 
                     panel.grid.minor = element_blank(), 
                     axis.line = element_line(colour = "black"), 
                     axis.text = element_text(), 
                     axis.title = element_text(size = 8), 
                     plot.caption = element_text(hjust = 0.5, size = rel(1)))
## `geom_smooth()` using formula 'y ~ x'

#Plot of overall disease state vs control:
AMD_Epigenetic_PhenoAge$Sample_Group <- factor(AMD_Epigenetic_PhenoAge$Sample_Group)
ggplot(data = AMD_Epigenetic_PhenoAge, 
       aes(x = Sample_Group, y = AgeAccelPheno, fill = Sample_Group)) +
  geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
  geom_jitter(aes(fill = Sample_Group), size = 2, shape = 21, alpha = 0.7, 
              position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) + 
  xlab("Disease State") + ylab("PhenoAge Acceleration Residual") + 
  scale_x_discrete(breaks = c("age-related macular degeneration","normal"), 
                   labels = c("Age-Related Macular Degeneration", "Normal")) + 
  scale_fill_manual(values = c("#77284e", "#ffffff")) +
  theme_classic() + theme(
              panel.border = element_blank(),
              panel.background = element_blank(),
              plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
              text = element_text(family = "Tahoma"),
              axis.title = element_text(face = "bold"),
              axis.text.x = element_text(colour = "black", size = 11),
              axis.text.y = element_text(colour = "black", size = 9),
              axis.line = element_line(size = 0.5, colour = "black"),
              legend.position = "none")

#Plot of disease state vs control that is stratified by sex:
AMD_Epigenetic_PhenoAge$Sex <- factor(AMD_Epigenetic_PhenoAge$Sex)
ggplot(data = AMD_Epigenetic_PhenoAge, 
       aes(x = Sample_Group, y = AgeAccelPheno, fill = Sex)) + 
  geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
  geom_jitter(aes(fill = Sex), size = 1, shape = 21, alpha = 0.7, 
              position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) + 
  xlab("Disease State") + ylab("PhenoAge Acceleration Residual") + 
  scale_x_discrete(breaks = c("age-related macular degeneration","normal"), 
                   labels = c("AMD", "Normal")) + 
  scale_fill_manual(values = c("#990000", "#2c7dab")) +
  theme_classic() + theme(
              panel.border = element_blank(),
              panel.background = element_blank(),
              plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
              text = element_text(family = "Tahoma"),
              axis.title = element_text(face = "bold"),
              axis.text.x = element_text(colour = "black", size = 11),
              axis.text.y = element_text(colour = "black", size = 9),
              axis.line = element_line(size = 0.5, colour = "black"))

#Determine statistical significance between disease state vs control with ANOVA.
summary(lm(AgeAccelPheno ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_PhenoAge))
## 
## Call:
## lm(formula = AgeAccelPheno ~ Sample_Group + Sex + Sex * Sample_Group, 
##     data = AMD_Epigenetic_PhenoAge)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -35.161  -7.392   4.113   8.431  22.159 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)  
## (Intercept)              -6.3813     4.6761  -1.365   0.1800  
## Sample_Groupnormal       -2.6576     7.2871  -0.365   0.7173  
## SexM                     11.9156     6.0368   1.974   0.0553 .
## Sample_Groupnormal:SexM   0.7959     9.2683   0.086   0.9320  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.79 on 40 degrees of freedom
## Multiple R-squared:  0.1547, Adjusted R-squared:  0.09128 
## F-statistic:  2.44 on 3 and 40 DF,  p-value: 0.07843
summary(aov(AgeAccelPheno ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_PhenoAge))
##                  Df Sum Sq Mean Sq F value Pr(>F)  
## Sample_Group      1     34    34.2   0.156 0.6948  
## Sex               1   1565  1564.6   7.156 0.0108 *
## Sample_Group:Sex  1      2     1.6   0.007 0.9320  
## Residuals        40   8746   218.7                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step 8: Skin & Blood clock.

The Skin & Blood clock (based on 391 CpGs) was developed to accurately measure the age of human fibroblasts, keratinocytes, buccal cells, endothelial cells, skin and blood samples. This clock shares 45 CpGs (out of 71 CpGs) with the blood-based clock from Hannum (2013) and 60 CpGs (out of 353 CpGs) with the pan tissue clock from Horvath (2013). Despite this significant overlap, epigenetic age acceleration of the skin & blood clock exhibits only moderate correlations with corresponding epigenetic age acceleration measures by Horvath (2013) and Hannum (2013) (r=0.5 and r=0.59, p<1.E-110) in the blood samples from the Women’s Health Initiative (BA23 study).

#Subset relevant epigenetic age variables. 
AMD_Epigenetic_AgeSkinBlood <- AMD_pData[, c("Sample_Name", "Sample_Group", "Array", "Slide", "Sex", "predictedGender", "Tissue", "predictedTissue", "Age", "DNAmAgeSkinBloodClock")] 
AMD_Epigenetic_AgeSkinBlood$AgeAccelSkinBlood <- lm(DNAmAgeSkinBloodClock ~ Age, data = AMD_Epigenetic_AgeSkinBlood)$residuals

#Create function to plot linear regression.
ggplotRegression <- function (fit) {

require(ggplot2)

ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) + 
  geom_point(aes(colour = AMD_Epigenetic_AgeSkinBlood$Sex, shape = AMD_Epigenetic_AgeSkinBlood$Sample_Group)) +
  stat_smooth(method = "lm", col = "red") 
}

#Regression plot of Skin & Blood clock predicted DNAmAge against reported age:
AMD_AgeSkinBlood_plot <- lm(DNAmAgeSkinBloodClock ~ Age, data = AMD_Epigenetic_AgeSkinBlood)
ggplotRegression(AMD_AgeSkinBlood_plot) + labs(x = "Reported age", y = "Skin & Blood predicted age", caption = "Comparing reported age and predicted age using Skin & Blood epigenetic clock") + 
  scale_colour_manual(name = "Sex", values = c("#990000", "#2c7dab")) +
  scale_shape_manual(name = "Disease State", values = c(20, 23), labels = c("AMD", "Normal")) +
  stat_cor(method = "spearman") +
  theme_bw() + theme(panel.border = element_blank(), 
                     panel.grid.major = element_blank(), 
                     panel.grid.minor = element_blank(), 
                     axis.line = element_line(colour = "black"), 
                     axis.text = element_text(), 
                     axis.title = element_text(size = 8), 
                     plot.caption = element_text(hjust = 0.5, size = rel(1)))
## `geom_smooth()` using formula 'y ~ x'

#Plot of overall disease state vs control:
AMD_Epigenetic_AgeSkinBlood$Sample_Group <- factor(AMD_Epigenetic_AgeSkinBlood$Sample_Group)
ggplot(data = AMD_Epigenetic_AgeSkinBlood, 
       aes(x = Sample_Group, y = AgeAccelSkinBlood, fill = Sample_Group)) +
  geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
  geom_jitter(aes(fill = Sample_Group), size = 2, shape = 21, alpha = 0.7, 
              position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) + 
  xlab("Disease State") + ylab("Skin & Blood Age Acceleration Residual") + 
  scale_x_discrete(breaks = c("age-related macular degeneration","normal"), 
                   labels = c("Age-Related Macular Degeneration", "Normal")) + 
  scale_fill_manual(values = c("#77284e", "#ffffff")) +
  theme_classic() + theme(
              panel.border = element_blank(),
              panel.background = element_blank(),
              plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
              text = element_text(family = "Tahoma"),
              axis.title = element_text(face = "bold"),
              axis.text.x = element_text(colour = "black", size = 11),
              axis.text.y = element_text(colour = "black", size = 9),
              axis.line = element_line(size = 0.5, colour = "black"),
              legend.position = "none")

#Plot of disease state vs control that is stratified by sex:
AMD_Epigenetic_AgeSkinBlood$Sex <- factor(AMD_Epigenetic_AgeSkinBlood$Sex)
ggplot(data = AMD_Epigenetic_AgeSkinBlood, 
       aes(x = Sample_Group, y = AgeAccelSkinBlood, fill = Sex)) + 
  geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
  geom_jitter(aes(fill = Sex), size = 1, shape = 21, alpha = 0.7, 
              position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) + 
  xlab("Disease State") + ylab("Skin & Blood Age Acceleration Residual") + 
  scale_x_discrete(breaks = c("age-related macular degeneration","normal"), 
                   labels = c("AMD", "Normal")) + 
  scale_fill_manual(values = c("#990000", "#2c7dab")) +
  theme_classic() + theme(
              panel.border = element_blank(),
              panel.background = element_blank(),
              plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
              text = element_text(family = "Tahoma"),
              axis.title = element_text(face = "bold"),
              axis.text.x = element_text(colour = "black", size = 11),
              axis.text.y = element_text(colour = "black", size = 9),
              axis.line = element_line(size = 0.5, colour = "black"))

#Determine statistical significance between disease state vs control with ANOVA.
summary(lm(AgeAccelSkinBlood ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_AgeSkinBlood))
## 
## Call:
## lm(formula = AgeAccelSkinBlood ~ Sample_Group + Sex + Sex * Sample_Group, 
##     data = AMD_Epigenetic_AgeSkinBlood)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.8670  -6.9018   0.5594   4.7605  15.4146 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)              -2.1484     2.3529  -0.913    0.367
## Sample_Groupnormal       -1.2454     3.6668  -0.340    0.736
## SexM                      4.6068     3.0376   1.517    0.137
## Sample_Groupnormal:SexM  -0.5158     4.6636  -0.111    0.912
## 
## Residual standard error: 7.441 on 40 degrees of freedom
## Multiple R-squared:  0.09159,    Adjusted R-squared:  0.02346 
## F-statistic: 1.344 on 3 and 40 DF,  p-value: 0.2736
summary(aov(AgeAccelSkinBlood ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_AgeSkinBlood))
##                  Df Sum Sq Mean Sq F value Pr(>F)  
## Sample_Group      1   21.9   21.94   0.396 0.5326  
## Sex               1  200.7  200.65   3.624 0.0642 .
## Sample_Group:Sex  1    0.7    0.68   0.012 0.9125  
## Residuals        40 2214.5   55.36                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step 9: Prepare data for Zhang clock.

AMD_Epigenetic_Age_Zhang <- AMD_samplesheet[, c("Sample_Name", "Sample_Group", "Tissue", "Sex", "Age")]

AMD_betas.Zhang <- getBeta(AMD_MSet.noob)

Step 10: Run Zhang clock.

Adapted from predictor code for Zhang clock.

This script is used to do age prediction based on Illumina 450K DNA methylation data. Coefficients of the predictor are based on 13,566 training samples. Two age predictors based on different methods (Elastic Net and BLUP) are developed based on 514 DNAm sites.

#Function changes missing value to mean value across all individuals for each probe. 
addna <- function(methy){
         methy[is.na(methy)]<-mean(methy,na.rm=T)
         return(methy)
}

#Load beta matrix and check format.
if(nrow(AMD_betas.Zhang) > ncol(AMD_betas.Zhang)){
        AMD_betas.Zhang <- t(AMD_betas.Zhang)
}

#Replace any NA with mean value for each probe.
dataNona <- apply(AMD_betas.Zhang, 2, function(x) addna(x)) 

#Standardise beta values within each individual by removing the mean and dividing by standard deviation for each individual. (Probe * IND).
dataNona.norm <- apply(dataNona, 1, scale)
rownames(dataNona.norm) <- colnames(dataNona)

#Get the ElasticNet predictor coefficient of each probe.
encoef <- read.table("~/KoborLab/kobor_space/kendrix/macular_degeneration/Zhang_Age_Predictor/en.coef", 
                     stringsAsFactor = FALSE, header = TRUE)
en_int <- encoef[1,2]
encoef <- encoef[-1,]
rownames(encoef) <- encoef$probe

#Get the BLUP coefficient of each probe.
blupcoef <- read.table("~/KoborLab/kobor_space/kendrix/macular_degeneration/Zhang_Age_Predictor/blup.coef",
                       stringsAsFactor = FALSE, header = TRUE)
blup_int <- blupcoef[1,2]
blupcoef <- blupcoef[-1,]
rownames(blupcoef) <- blupcoef$probe

#Get common probes between predictors and data.
encomm <- intersect(rownames(encoef), rownames(dataNona.norm))
blupcomm <- intersect(rownames(blupcoef), rownames(dataNona.norm))

#Check for missing probes.
endiff <- nrow(encoef) - length(encomm)
blupdiff <- nrow(blupcoef) - length(blupcomm)

print(paste0 (endiff, " probe(s) in Elastic Net predictor is(are) not in the data"))
## [1] "0 probe(s) in Elastic Net predictor is(are) not in the data"
print(paste0 (blupdiff, " probe(s) in BLUP predictor is(are) not in the data"))
## [1] "0 probe(s) in BLUP predictor is(are) not in the data"
print("BLUP can perform better if the number of missing probes is too large!")
## [1] "BLUP can perform better if the number of missing probes is too large!"
#Extract common probes and perform age prediction.
encoef <- encoef[encomm,]
blupcoef <- blupcoef[blupcomm,]
ElasticNetPredAge <- encoef$coef%*%dataNona.norm[encomm,] + en_int
BLUPPredAge <- blupcoef$coef%*%dataNona.norm[blupcomm,] + blup_int

#Add predicted results to metadata. 
ElasticNetPredAge <- ElasticNetPredAge[, AMD_Epigenetic_Age_Zhang$Sample_Name]
AMD_Epigenetic_Age_Zhang$ElasticNetPredAge <- as.double(ElasticNetPredAge)
AMD_Epigenetic_Age_Zhang$ElasticNetAgeAccelResidual <- lm(ElasticNetPredAge ~ Age, 
                                                          data = AMD_Epigenetic_Age_Zhang)$residuals

BLUPPredAge <- BLUPPredAge[, AMD_Epigenetic_Age_Zhang$Sample_Name]
AMD_Epigenetic_Age_Zhang$BLUPPredAge <- as.double(BLUPPredAge)
AMD_Epigenetic_Age_Zhang$BLUPAgeAccelResidual <- lm(BLUPPredAge ~ Age, 
                                                    data = AMD_Epigenetic_Age_Zhang)$residuals

Step 11: Zhang clock - ElasticNet method.

#Create function to plot linear regression.
ggplotRegression <- function (fit) {

require(ggplot2)

ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) + 
  geom_point(aes(colour = AMD_Epigenetic_Age_Zhang$Sex, shape = AMD_Epigenetic_Age_Zhang$Sample_Group)) +
  stat_smooth(method = "lm", col = "red") 
}

#Regression plot of Zhang clock predicted DNAmAge against reported age:
AMD_ENZhang_plot <- lm(ElasticNetPredAge ~ Age, data = AMD_Epigenetic_Age_Zhang)
ggplotRegression(AMD_ENZhang_plot) + labs(x = "Reported age", y = "Zhang predicted age", caption = "Comparing reported age and predicted age using Zhang epigenetic clock - ElasticNet method") + 
  scale_colour_manual(name = "Sex", values = c("#990000", "#2c7dab")) +
  scale_shape_manual(name = "Disease State", values = c(20, 23), labels = c("AMD", "Normal")) +
  stat_cor(method = "spearman") +
  theme_bw() + theme(panel.border = element_blank(), 
                     panel.grid.major = element_blank(), 
                     panel.grid.minor = element_blank(), 
                     axis.line = element_line(colour = "black"), 
                     axis.text = element_text(), 
                     axis.title = element_text(size = 8), 
                     plot.caption = element_text(hjust = 0.5, size = rel(1)))
## `geom_smooth()` using formula 'y ~ x'

#Plot of overall disease state vs control:
AMD_Epigenetic_Age_Zhang$Sample_Group <- factor(AMD_Epigenetic_Age_Zhang$Sample_Group)
ggplot(data = AMD_Epigenetic_Age_Zhang, 
       aes(x = Sample_Group, y = ElasticNetAgeAccelResidual, fill = Sample_Group)) +
  geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
  geom_jitter(aes(fill = Sample_Group), size = 2, shape = 21, alpha = 0.7, 
              position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) + 
  xlab("Disease State") + ylab("Zhang Acceleration Residual (ElasticNet)") + 
  scale_x_discrete(breaks = c("age-related macular degeneration","normal"), 
                   labels = c("Age-Related Macular Degeneration", "Normal")) + 
  scale_fill_manual(values = c("#77284e", "#ffffff")) +
  theme_classic() + theme(
              panel.border = element_blank(),
              panel.background = element_blank(),
              plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
              text = element_text(family = "Tahoma"),
              axis.title = element_text(face = "bold"),
              axis.text.x = element_text(colour = "black", size = 11),
              axis.text.y = element_text(colour = "black", size = 9),
              axis.line = element_line(size = 0.5, colour = "black"),
              legend.position = "none")

#Plot of disease state vs control that is stratified by sex:
AMD_Epigenetic_Age_Zhang$Sex <- factor(AMD_Epigenetic_Age_Zhang$Sex)
ggplot(data = AMD_Epigenetic_Age_Zhang, 
       aes(x = Sample_Group, y = ElasticNetAgeAccelResidual, fill = Sex)) + 
  geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
  geom_jitter(aes(fill = Sex), size = 1, shape = 21, alpha = 0.7, 
              position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) + 
  xlab("Disease State") + ylab("Zhang Acceleration Residual (ElasticNet)") + 
  scale_x_discrete(breaks = c("age-related macular degeneration","normal"), 
                   labels = c("AMD", "Normal")) + 
  scale_fill_manual(values = c("#990000", "#2c7dab")) +
  theme_classic() + theme(
              panel.border = element_blank(),
              panel.background = element_blank(),
              plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
              text = element_text(family = "Tahoma"),
              axis.title = element_text(face = "bold"),
              axis.text.x = element_text(colour = "black", size = 11),
              axis.text.y = element_text(colour = "black", size = 9),
              axis.line = element_line(size = 0.5, colour = "black"))

#Determine statistical significance between disease state vs control with ANOVA.
summary(lm(ElasticNetAgeAccelResidual ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Zhang))
## 
## Call:
## lm(formula = ElasticNetAgeAccelResidual ~ Sample_Group + Sex + 
##     Sex * Sample_Group, data = AMD_Epigenetic_Age_Zhang)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.356  -5.762   2.333   4.889  12.997 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)  
## (Intercept)              -4.7594     2.4807  -1.919   0.0622 .
## Sample_Groupnormal        0.1734     3.8658   0.045   0.9645  
## SexM                      8.2229     3.2025   2.568   0.0141 *
## Sample_Groupnormal:SexM  -1.3250     4.9168  -0.269   0.7889  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.845 on 40 degrees of freedom
## Multiple R-squared:  0.2006, Adjusted R-squared:  0.1407 
## F-statistic: 3.347 on 3 and 40 DF,  p-value: 0.02842
summary(aov(ElasticNetAgeAccelResidual ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Zhang))
##                  Df Sum Sq Mean Sq F value  Pr(>F)   
## Sample_Group      1    1.8     1.8   0.029 0.86654   
## Sex               1  611.6   611.6   9.939 0.00306 **
## Sample_Group:Sex  1    4.5     4.5   0.073 0.78894   
## Residuals        40 2461.5    61.5                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step 12: Zhang clock - BLUP method.

#Create function to plot linear regression.
ggplotRegression <- function (fit) {

require(ggplot2)

ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) + 
  geom_point(aes(colour = AMD_Epigenetic_Age_Zhang$Sex, shape = AMD_Epigenetic_Age_Zhang$Sample_Group)) +
  stat_smooth(method = "lm", col = "red") 
}

#Regression plot of Zhang clock predicted DNAmAge against reported age:
AMD_BLUPZhang_plot <- lm(BLUPPredAge ~ Age, data = AMD_Epigenetic_Age_Zhang)
ggplotRegression(AMD_BLUPZhang_plot) + labs(x = "Reported age", y = "Zhang predicted age", caption = "Comparing reported age and predicted age using Zhang epigenetic clock - BLUP method") + 
  scale_colour_manual(name = "Sex", values = c("#990000", "#2c7dab")) +
  scale_shape_manual(name = "Disease State", values = c(20, 23), labels = c("AMD", "Normal")) +
  stat_cor(method = "spearman") +
  theme_bw() + theme(panel.border = element_blank(), 
                     panel.grid.major = element_blank(), 
                     panel.grid.minor = element_blank(), 
                     axis.line = element_line(colour = "black"), 
                     axis.text = element_text(), 
                     axis.title = element_text(size = 8), 
                     plot.caption = element_text(hjust = 0.5, size = rel(1)))
## `geom_smooth()` using formula 'y ~ x'

#Plot of overall disease state vs control:
AMD_Epigenetic_Age_Zhang$Sample_Group <- factor(AMD_Epigenetic_Age_Zhang$Sample_Group)
ggplot(data = AMD_Epigenetic_Age_Zhang, 
       aes(x = Sample_Group, y = BLUPAgeAccelResidual, fill = Sample_Group)) +
  geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
  geom_jitter(aes(fill = Sample_Group), size = 2, shape = 21, alpha = 0.7, 
              position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) + 
  xlab("Disease State") + ylab("Zhang Acceleration Residual (BLUP)") + 
  scale_x_discrete(breaks = c("age-related macular degeneration","normal"), 
                   labels = c("Age-Related Macular Degeneration", "Normal")) + 
  scale_fill_manual(values = c("#77284e", "#ffffff")) +
  theme_classic() + theme(
              panel.border = element_blank(),
              panel.background = element_blank(),
              plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
              text = element_text(family = "Tahoma"),
              axis.title = element_text(face = "bold"),
              axis.text.x = element_text(colour = "black", size = 11),
              axis.text.y = element_text(colour = "black", size = 9),
              axis.line = element_line(size = 0.5, colour = "black"),
              legend.position = "none")

#Plot of disease state vs control that is stratified by sex:
AMD_Epigenetic_Age_Zhang$Sex <- factor(AMD_Epigenetic_Age_Zhang$Sex)
ggplot(data = AMD_Epigenetic_Age_Zhang, 
       aes(x = Sample_Group, y = BLUPAgeAccelResidual, fill = Sex)) + 
  geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
  geom_jitter(aes(fill = Sex), size = 1, shape = 21, alpha = 0.7, 
              position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) + 
  xlab("Disease State") + ylab("Zhang Acceleration Residual (BLUP)") + 
  scale_x_discrete(breaks = c("age-related macular degeneration","normal"), 
                   labels = c("AMD", "Normal")) + 
  scale_fill_manual(values = c("#990000", "#2c7dab")) +
  theme_classic() + theme(
              panel.border = element_blank(),
              panel.background = element_blank(),
              plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
              text = element_text(family = "Tahoma"),
              axis.title = element_text(face = "bold"),
              axis.text.x = element_text(colour = "black", size = 11),
              axis.text.y = element_text(colour = "black", size = 9),
              axis.line = element_line(size = 0.5, colour = "black"))

#Determine statistical significance between disease state vs control with ANOVA.
summary(lm(BLUPAgeAccelResidual ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Zhang))
## 
## Call:
## lm(formula = BLUPAgeAccelResidual ~ Sample_Group + Sex + Sex * 
##     Sample_Group, data = AMD_Epigenetic_Age_Zhang)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.261  -3.967   1.590   3.561  11.936 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)  
## (Intercept)              -2.9243     1.8955  -1.543   0.1308  
## Sample_Groupnormal       -1.1535     2.9539  -0.390   0.6982  
## SexM                      5.7728     2.4471   2.359   0.0233 *
## Sample_Groupnormal:SexM  -0.4401     3.7570  -0.117   0.9073  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.994 on 40 degrees of freedom
## Multiple R-squared:  0.1925, Adjusted R-squared:  0.1319 
## F-statistic: 3.178 on 3 and 40 DF,  p-value: 0.03426
summary(aov(BLUPAgeAccelResidual ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Zhang))
##                  Df Sum Sq Mean Sq F value  Pr(>F)   
## Sample_Group      1   16.8    16.8   0.469 0.49747   
## Sex               1  325.2   325.2   9.051 0.00453 **
## Sample_Group:Sex  1    0.5     0.5   0.014 0.90734   
## Residuals        40 1437.1    35.9                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step 13: Run DNAmCortical clock.

This script is used to do age prediction based on Illumina 450K DNA methylation data. Coefficients of the predictor are based on 1,397 cortical tissues (Age range: 1 - 104 years old). Shireby et al. identified 347 DNAm sites that, in combination optimally predict age in the human cortex using elastic net regression.

#Get beta matrix.
AMD_betas.Cortical <- getBeta(AMD_MSet.noob)

#Check if sample name order match in pData and beta matrix.
identical(rownames(AMD_pData), colnames(AMD_betas.Cortical)) #TRUE.
## [1] TRUE
#Run DNAmCortical clock.
source("~/KoborLab/kobor_space/kendrix/macular_degeneration/CorticalClock/CorticalClock.r")
setwd("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/")

CorticalClock(betas = AMD_betas.Cortical, pheno = AMD_pData, 
              dir = "~/KoborLab/kobor_space/kendrix/macular_degeneration/CorticalClock/", IDcol = "Sample_Name", 
              Agecol = "Age")
## [1] "all the probes overlap between your data and the clock probes - no need for imputing missing values"
##                 Cortical Clock
## Correlation (r)           0.45
## RMSE (years)             43.94
## MAD (years)              43.29
## [1] "Your Cortical DNAm ages are saved in the file 'CorticalPred.csv'"
## [1] "Accuracy statistics comparing DNAm age and chronological age are saved in the file 'accuracy_statistics.csv'"
## [1] "Plot of Cortical DNAm age against age is saved in the file 'CorticalClockplot.pdf'"
#Add predicted values to the cortical pData.
AMD_Epigenetic_Age_Cortical <- AMD_samplesheet[, c("Sample_Name", "Sample_Group", "Tissue", "Sex", "Age")]
DNAmCortical <- read.csv("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/CorticalPred.csv", header = TRUE)

identical(AMD_Epigenetic_Age_Cortical$Sample_Name, DNAmCortical$ID) #TRUE.
## [1] TRUE
AMD_Epigenetic_Age_Cortical$DNAmCorticalPredAge <- DNAmCortical$brainpred
AMD_Epigenetic_Age_Cortical$DNAmCorticalAgeAccelResidual <- lm(DNAmCorticalPredAge ~ Age, 
                                                          data = AMD_Epigenetic_Age_Cortical)$residuals

Step 14: DNAmCortical clock.

#Create function to plot linear regression.
ggplotRegression <- function (fit) {

require(ggplot2)

ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) + 
  geom_point(aes(colour = AMD_Epigenetic_Age_Cortical$Sex, shape = AMD_Epigenetic_Age_Cortical$Sample_Group)) +
  stat_smooth(method = "lm", col = "red") 
}

#Regression plot of Zhang clock predicted DNAmAge against reported age:
AMD_Cortical_plot <- lm(DNAmCorticalPredAge ~ Age, data = AMD_Epigenetic_Age_Cortical)
ggplotRegression(AMD_Cortical_plot) + labs(x = "Reported age", y = "DNAmCortical predicted age", caption = "Comparing reported age and predicted age using DNAmCortical epigenetic clock") + 
  scale_colour_manual(name = "Sex", values = c("#990000", "#2c7dab")) +
  scale_shape_manual(name = "Disease State", values = c(20, 23), labels = c("AMD", "Normal")) +
  stat_cor(method = "spearman") +
  theme_bw() + theme(panel.border = element_blank(), 
                     panel.grid.major = element_blank(), 
                     panel.grid.minor = element_blank(), 
                     axis.line = element_line(colour = "black"), 
                     axis.text = element_text(), 
                     axis.title = element_text(size = 8), 
                     plot.caption = element_text(hjust = 0.5, size = rel(1)))
## `geom_smooth()` using formula 'y ~ x'

#Plot of overall disease state vs control:
AMD_Epigenetic_Age_Cortical$Sample_Group <- factor(AMD_Epigenetic_Age_Cortical$Sample_Group)
ggplot(data = AMD_Epigenetic_Age_Cortical, 
       aes(x = Sample_Group, y = DNAmCorticalAgeAccelResidual, fill = Sample_Group)) +
  geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
  geom_jitter(aes(fill = Sample_Group), size = 2, shape = 21, alpha = 0.7, 
              position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) + 
  xlab("Disease State") + ylab("DNAmCortical Acceleration Residual") + 
  scale_x_discrete(breaks = c("age-related macular degeneration","normal"), 
                   labels = c("Age-Related Macular Degeneration", "Normal")) + 
  scale_fill_manual(values = c("#77284e", "#ffffff")) +
  theme_classic() + theme(
              panel.border = element_blank(),
              panel.background = element_blank(),
              plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
              text = element_text(family = "Tahoma"),
              axis.title = element_text(face = "bold"),
              axis.text.x = element_text(colour = "black", size = 11),
              axis.text.y = element_text(colour = "black", size = 9),
              axis.line = element_line(size = 0.5, colour = "black"),
              legend.position = "none")

#Plot of disease state vs control that is stratified by sex:
AMD_Epigenetic_Age_Cortical$Sex <- factor(AMD_Epigenetic_Age_Cortical$Sex)
ggplot(data = AMD_Epigenetic_Age_Cortical, 
       aes(x = Sample_Group, y = DNAmCorticalAgeAccelResidual, fill = Sex)) + 
  geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
  geom_jitter(aes(fill = Sex), size = 1, shape = 21, alpha = 0.7, 
              position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) + 
  xlab("Disease State") + ylab("DNAmCortical Acceleration Residual") + 
  scale_x_discrete(breaks = c("age-related macular degeneration","normal"), 
                   labels = c("AMD", "Normal")) + 
  scale_fill_manual(values = c("#990000", "#2c7dab")) +
  theme_classic() + theme(
              panel.border = element_blank(),
              panel.background = element_blank(),
              plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
              text = element_text(family = "Tahoma"),
              axis.title = element_text(face = "bold"),
              axis.text.x = element_text(colour = "black", size = 11),
              axis.text.y = element_text(colour = "black", size = 9),
              axis.line = element_line(size = 0.5, colour = "black"))

#Determine statistical significance between disease state vs control with ANOVA.
summary(lm(DNAmCorticalAgeAccelResidual ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Cortical))
## 
## Call:
## lm(formula = DNAmCorticalAgeAccelResidual ~ Sample_Group + Sex + 
##     Sex * Sample_Group, data = AMD_Epigenetic_Age_Cortical)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.0785  -4.8258  -0.0874   4.3463  21.7664 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)               -1.947      2.675  -0.728    0.471
## Sample_Groupnormal        -4.298      4.169  -1.031    0.309
## SexM                       5.585      3.454   1.617    0.114
## Sample_Groupnormal:SexM    1.379      5.303   0.260    0.796
## 
## Residual standard error: 8.46 on 40 degrees of freedom
## Multiple R-squared:  0.1526, Adjusted R-squared:  0.08907 
## F-statistic: 2.401 on 3 and 40 DF,  p-value: 0.08191
summary(aov(DNAmCorticalAgeAccelResidual ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Cortical))
##                  Df Sum Sq Mean Sq F value Pr(>F)  
## Sample_Group      1  114.1   114.1   1.594 0.2141  
## Sex               1  396.7   396.7   5.543 0.0236 *
## Sample_Group:Sex  1    4.8     4.8   0.068 0.7962  
## Residuals        40 2863.0    71.6                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step 15: Prepare data for PedBE clock.

AMD_Epigenetic_Age_PedBE <- AMD_samplesheet[, c("Sample_Name", "Sample_Group", "Tissue", "Sex", "Age")]

AMD_betas.PedBE <- getBeta(AMD_MSet.noob)

Step 16: Run PedBE clock.

Sourced and adapted from “~/KoborLab/kobor_space/shared_coding_resource/Ped_Age_Predictor_McEwen/Age_Pred_McEwen_NG.R”.

PedAge_McEwen_NG <- function(dat0){

#call required libraries  
  require(WGCNA)
  require(sqldf)
  require(impute)
  require(flashClust)
  require(dynamicTreeCut)
  require(RPMM)

source("~/KoborLab/kobor_space/shared_coding_resource/Horvath_Age_Prediction/NORMALIZATION.R")
datClock <- read.csv("~/KoborLab/kobor_space/shared_coding_resource/Ped_Age_Predictor_McEwen/datcoefInteresting94.csv", stringsAsFactors = F)  

# Impute missing data 
if(sum(is.na(dat0)) > 0) {dat0=data.frame(t(impute.knn(as.matrix(t(dat0)))$data))}

# How many probes are missing from the clock
print(paste("You have", length(rownames(dat0)[rownames(dat0) %in% datClock$ID[2:nrow(datClock)]]), "out of", length(datClock$ID[2:nrow(datClock)]), "probes needed for the age clock. Missing", length(datClock$ID[2:nrow(datClock)]) - length(rownames(dat0)[rownames(dat0) %in% datClock$ID[2:nrow(datClock)]]), "altogether."))
print(paste("Resulting in", round(100-100*(length(rownames(dat0)[rownames(dat0) %in% datClock$ID[2:nrow(datClock)]])/length(datClock$ID[2:nrow(datClock)])), 0),"% missingness. Data will be imputed using training data."))

# Add probe names as the first column
dat0 <- as.data.frame(dat0)
dat0$ProbeID <- rownames(dat0)
dat0 <- dat0[, c(ncol(dat0), 1:(ncol(dat0)-1))]
nSamples <- dim(dat0)[[2]]-1
nProbes <- dim(dat0)[[1]]  

#Normalize data
dat1 <- dat0[,-1]
gold.mean <- as.numeric(apply(dat1, 1, median, na.rm = TRUE))
datMethUsed <- t(dat1)
datMethUsedNormalized <- BMIQcalibration(datM = datMethUsed, goldstandard.beta = gold.mean, plots = FALSE)
datM <- t(datMethUsedNormalized)
datM <- as.data.frame(datM)
datM$ProbeID <- rownames(datM)
datM <- datM[, c(ncol(datM), 1:(ncol(datM)-1))]
datMt <- t(datM[,-1])
colnames(datMt) <- as.character(datM[,1])
anti.trafo <- function(x,adult.age=20) {ifelse(x<0, (1+adult.age)*exp(x)-1, (1+adult.age)*x+adult.age)}
selectCpGsClock <- is.element(dimnames(datMt)[[2]], as.character(datClock[,1][-1]))
datMethClock0 <- data.frame(datMt[,selectCpGsClock])
datMethClock <- data.frame(datMethClock0[as.character(datClock[,1][-1])])
datMethClock$DNAmAgeBuccal <- as.numeric(anti.trafo(datClock[1,2]+as.numeric(as.matrix(datMethClock)%*%as.numeric(datClock[,2][-1]))))
datout <- data.frame(SampleID <- rownames(datMethClock), DNAmAge <- datMethClock$DNAmAgeBuccal)
}
dim(AMD_betas.PedBE) #485512 rows.
## [1] 485512     44
#Remove rows with more than 50% NA
AMD_betas.PedBE <- AMD_betas.PedBE[which(rowMeans(!is.na(AMD_betas.PedBE)) > 0.5), ]

#Run PedBE clock.
PedBE_DNAmAge <- PedAge_McEwen_NG(AMD_betas.PedBE)
## Loading required package: WGCNA
## Warning: package 'WGCNA' was built under R version 3.6.3
## Loading required package: dynamicTreeCut
## Warning: package 'dynamicTreeCut' was built under R version 3.6.3
## Loading required package: fastcluster
## Warning: package 'fastcluster' was built under R version 3.6.3
## 
## Attaching package: 'fastcluster'
## The following object is masked from 'package:stats':
## 
##     hclust
## 
## 
## Attaching package: 'WGCNA'
## The following object is masked from 'package:IRanges':
## 
##     cor
## The following object is masked from 'package:S4Vectors':
## 
##     cor
## The following object is masked from 'package:stats':
## 
##     cor
## Loading required package: sqldf
## Loading required package: gsubfn
## Loading required package: proto
## Warning in fun(libname, pkgname): couldn't connect to display ":0"
## Loading required package: flashClust
## Warning: package 'flashClust' was built under R version 3.6.3
## 
## Attaching package: 'flashClust'
## The following object is masked from 'package:fastcluster':
## 
##     hclust
## The following object is masked from 'package:stats':
## 
##     hclust
## Loading required package: RPMM
## Warning: package 'RPMM' was built under R version 3.6.3
## Loading required package: cluster
## [1] "You have 94 out of 94 probes needed for the age clock. Missing 0 altogether."
## [1] "Resulting in 0 % missingness. Data will be imputed using training data."
## [1] "Fitting EM beta mixture to goldstandard probes"
## [1] Inf
## [1] 0.004582211
## [1] 0.002934143
## [1] 0.004397809
## [1] 0.004897824
## [1] "Done"
## ii= 1
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004790258
## [1] 0.002362019
## [1] 0.003154597
## [1] 0.003574
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 2
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004211355
## [1] 0.002087919
## [1] 0.003216463
## [1] 0.003465551
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 3
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004627178
## [1] 0.002259131
## [1] 0.002849442
## [1] 0.003346199
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 4
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004597235
## [1] 0.002196956
## [1] 0.002015107
## [1] 0.002496612
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 5
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.005547865
## [1] 0.002994156
## [1] 0.001823028
## [1] 0.002511824
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 6
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.00471737
## [1] 0.002323473
## [1] 0.00152028
## [1] 0.00206841
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 7
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004640662
## [1] 0.002265562
## [1] 0.002637281
## [1] 0.003028979
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 8
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004751371
## [1] 0.002334002
## [1] 0.001571872
## [1] 0.002123473
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 9
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004536413
## [1] 0.002096663
## [1] 0.001106063
## [1] 0.001357692
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 10
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.00424201
## [1] 0.001385648
## [1] 0.0007296178
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 11
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004159753
## [1] 0.001882673
## [1] 0.001675138
## [1] 0.00212326
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 12
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004269302
## [1] 0.001959428
## [1] 0.00267937
## [1] 0.002906533
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 13
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004772533
## [1] 0.002340347
## [1] 0.002208179
## [1] 0.002715199
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 14
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004664102
## [1] 0.001201669
## [1] 0.0007497667
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 15
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004363746
## [1] 0.002013022
## [1] 0.001216025
## [1] 0.001807108
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 16
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004571076
## [1] 0.002422267
## [1] 0.003608208
## [1] 0.003946688
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 17
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004152306
## [1] 0.001905675
## [1] 0.002089056
## [1] 0.002558118
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 18
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.003649308
## [1] 0.001259969
## [1] 0.00104714
## [1] 0.001412601
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 19
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.003979101
## [1] 0.001780051
## [1] 0.001298679
## [1] 0.001807178
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 20
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.003537082
## [1] 0.00146439
## [1] 0.001389145
## [1] 0.001772284
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 21
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004986264
## [1] 0.001151158
## [1] 0.0008174939
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 22
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004631683
## [1] 0.002214069
## [1] 0.002377576
## [1] 0.002798655
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 23
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.003066756
## [1] 0.001126385
## [1] 0.001409774
## [1] 0.001613505
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 24
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004076159
## [1] 0.001866704
## [1] 0.002101919
## [1] 0.002624584
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 25
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.003616447
## [1] 0.001452421
## [1] 0.001132568
## [1] 0.001518411
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 26
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004219319
## [1] 0.002010245
## [1] 0.001974118
## [1] 0.002458454
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 27
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004412387
## [1] 0.001564638
## [1] 0.0006919762
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 28
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.002030515
## [1] 0.0009946138
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 29
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004971709
## [1] 0.002497243
## [1] 0.00356292
## [1] 0.004017098
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 30
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004148087
## [1] 0.002074636
## [1] 0.003306652
## [1] 0.003595068
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 31
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004058778
## [1] 0.001821058
## [1] 0.002452984
## [1] 0.00276532
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 32
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.003605015
## [1] 0.001404906
## [1] 0.001254988
## [1] 0.001608153
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 33
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004371892
## [1] 0.002025162
## [1] 0.001673002
## [1] 0.002163652
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 34
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.001947918
## [1] 0.0009276869
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 35
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004051784
## [1] 0.00186273
## [1] 0.002712085
## [1] 0.003084828
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 36
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.003547857
## [1] 0.00149191
## [1] 0.002332382
## [1] 0.002622667
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 37
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.005424021
## [1] 0.002940651
## [1] 0.00238355
## [1] 0.003131738
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 38
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004487629
## [1] 0.002120229
## [1] 0.002720539
## [1] 0.003134397
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 39
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.005884391
## [1] 0.003196969
## [1] 0.00342038
## [1] 0.004099628
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 40
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004579889
## [1] 0.002172739
## [1] 0.002448184
## [1] 0.002862934
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 41
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004412343
## [1] 0.002079192
## [1] 0.002336572
## [1] 0.002715363
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 42
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004763955
## [1] 0.002353705
## [1] 0.002791948
## [1] 0.003331998
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 43
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.003559414
## [1] 0.001429145
## [1] 0.001053754
## [1] 0.001427752
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 44
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.005014635
## [1] 0.002575554
## [1] 0.002400071
## [1] 0.003002924
## [1] "Done"
## [1] "Start normalising input probes"
AMD_Epigenetic_Age_PedBE <- cbind(AMD_Epigenetic_Age_PedBE, PedBE_DNAmAge$DNAmAge....datMethClock.DNAmAgeBuccal)
colnames(AMD_Epigenetic_Age_PedBE)[6] <- "DNAmAgePedBE"

AMD_Epigenetic_Age_PedBE$DNAmPedBEAgeAccelResidual <- lm(DNAmAgePedBE ~ Age, 
                                                          data = AMD_Epigenetic_Age_PedBE)$residuals

Step 17: PedBE clock.

#Create function to plot linear regression.
ggplotRegression <- function (fit) {

require(ggplot2)

ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) + 
  geom_point(aes(colour = AMD_Epigenetic_Age_PedBE$Sex, shape = AMD_Epigenetic_Age_PedBE$Sample_Group)) +
  stat_smooth(method = "lm", col = "red") 
}

#Regression plot of Zhang clock predicted DNAmAge against reported age:
AMD_PedBE_plot <- lm(DNAmAgePedBE ~ Age, data = AMD_Epigenetic_Age_PedBE)
ggplotRegression(AMD_PedBE_plot) + labs(x = "Reported age", y = "DNAmPedBE predicted age", caption = "Comparing reported age and predicted age using PedBE epigenetic clock") + 
  scale_colour_manual(name = "Sex", values = c("#990000", "#2c7dab")) +
  scale_shape_manual(name = "Disease State", values = c(20, 23), labels = c("AMD", "Normal")) +
  stat_cor(method = "spearman") +
  theme_bw() + theme(panel.border = element_blank(), 
                     panel.grid.major = element_blank(), 
                     panel.grid.minor = element_blank(), 
                     axis.line = element_line(colour = "black"), 
                     axis.text = element_text(), 
                     axis.title = element_text(size = 8), 
                     plot.caption = element_text(hjust = 0.5, size = rel(1)))
## `geom_smooth()` using formula 'y ~ x'

#Plot of overall disease state vs control:
AMD_Epigenetic_Age_PedBE$Sample_Group <- factor(AMD_Epigenetic_Age_PedBE$Sample_Group)
ggplot(data = AMD_Epigenetic_Age_PedBE, 
       aes(x = Sample_Group, y = DNAmPedBEAgeAccelResidual, fill = Sample_Group)) +
  geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
  geom_jitter(aes(fill = Sample_Group), size = 2, shape = 21, alpha = 0.7, 
              position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) + 
  xlab("Disease State") + ylab("DNAmPedBE Acceleration Residual") + 
  scale_x_discrete(breaks = c("age-related macular degeneration","normal"), 
                   labels = c("Age-Related Macular Degeneration", "Normal")) + 
  scale_fill_manual(values = c("#77284e", "#ffffff")) +
  theme_classic() + theme(
              panel.border = element_blank(),
              panel.background = element_blank(),
              plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
              text = element_text(family = "Tahoma"),
              axis.title = element_text(face = "bold"),
              axis.text.x = element_text(colour = "black", size = 11),
              axis.text.y = element_text(colour = "black", size = 9),
              axis.line = element_line(size = 0.5, colour = "black"),
              legend.position = "none")

#Plot of disease state vs control that is stratified by sex:
AMD_Epigenetic_Age_PedBE$Sex <- factor(AMD_Epigenetic_Age_PedBE$Sex)
ggplot(data = AMD_Epigenetic_Age_PedBE, 
       aes(x = Sample_Group, y = DNAmPedBEAgeAccelResidual, fill = Sex)) + 
  geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
  geom_jitter(aes(fill = Sex), size = 1, shape = 21, alpha = 0.7, 
              position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) + 
  xlab("Disease State") + ylab("DNAmPedBE Acceleration Residual") + 
  scale_x_discrete(breaks = c("age-related macular degeneration","normal"), 
                   labels = c("AMD", "Normal")) + 
  scale_fill_manual(values = c("#990000", "#2c7dab")) +
  theme_classic() + theme(
              panel.border = element_blank(),
              panel.background = element_blank(),
              plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
              text = element_text(family = "Tahoma"),
              axis.title = element_text(face = "bold"),
              axis.text.x = element_text(colour = "black", size = 11),
              axis.text.y = element_text(colour = "black", size = 9),
              axis.line = element_line(size = 0.5, colour = "black"))

#Determine statistical significance between disease state vs control with ANOVA.
summary(lm(DNAmPedBEAgeAccelResidual ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_PedBE))
## 
## Call:
## lm(formula = DNAmPedBEAgeAccelResidual ~ Sample_Group + Sex + 
##     Sex * Sample_Group, data = AMD_Epigenetic_Age_PedBE)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3188 -1.0868  0.2439  0.9448  2.6585 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)  
## (Intercept)             -0.59339    0.44031  -1.348   0.1854  
## Sample_Groupnormal      -0.07251    0.68617  -0.106   0.9164  
## SexM                     1.02608    0.56844   1.805   0.0786 .
## Sample_Groupnormal:SexM -0.01812    0.87272  -0.021   0.9835  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.392 on 40 degrees of freedom
## Multiple R-squared:  0.1226, Adjusted R-squared:  0.05681 
## F-statistic: 1.863 on 3 and 40 DF,  p-value: 0.1513
summary(aov(DNAmPedBEAgeAccelResidual ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_PedBE))
##                  Df Sum Sq Mean Sq F value Pr(>F)  
## Sample_Group      1   0.03   0.029   0.015 0.9038  
## Sex               1  10.81  10.808   5.575 0.0232 *
## Sample_Group:Sex  1   0.00   0.001   0.000 0.9835  
## Residuals        40  77.55   1.939                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step 18: GrimAge clock.

#Subset relevant epigenetic age variables. 
AMD_Epigenetic_Age_Grim <- AMD_pData[, c("Sample_Name", "Sample_Group", "Array", "Slide", "Sex", "predictedGender", "Tissue", "predictedTissue", "Age", "DNAmGrimAge", "AgeAccelGrim")] 

#Create function to plot linear regression.
ggplotRegression <- function (fit) {

require(ggplot2)

ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) + 
  geom_point(aes(colour = AMD_Epigenetic_Age_Grim$Sex, shape = AMD_Epigenetic_Age_Grim$Sample_Group, size = 1)) +
  guides(size = FALSE) +
  stat_smooth(method = "lm", col = "red") 
}

#Regression plot of Grim predicted DNAmAge against reported age:
AMD_Grim_plot <- lm(DNAmGrimAge ~ Age, data = AMD_Epigenetic_Age_Grim)
ggplotRegression(AMD_Grim_plot) + labs(x = "Reported age", y = "DNAmGrimAge predicted age", caption = "Comparing reported age and predicted age using GrimAge epigenetic clock") + 
  scale_colour_manual(name = "Sex", values = c("#990000", "#2c7dab"), labels = c("Female", "Male")) +
  scale_shape_manual(name = "Disease State", values = c(20, 23), labels = c("AMD", "Normal")) +
  stat_cor(method = "spearman") +
  theme_bw() + theme(panel.border = element_blank(), 
                     panel.grid.major = element_blank(), 
                     panel.grid.minor = element_blank(), 
                     axis.line = element_line(colour = "black"), 
                     axis.text = element_text(), 
                     axis.title = element_text(size = 8), 
                     plot.caption = element_text(hjust = 0.5, size = rel(1)))
## `geom_smooth()` using formula 'y ~ x'

#Plot of overall disease state vs control:
AMD_Epigenetic_Age_Grim$Sample_Group <- factor(AMD_Epigenetic_Age_Grim$Sample_Group)
ggplot(data = AMD_Epigenetic_Age_Grim, 
       aes(x = Sample_Group, y = AgeAccelGrim, fill = Sample_Group)) +
  geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
  geom_jitter(aes(fill = Sample_Group), size = 2, shape = 21, alpha = 0.7, 
              position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) + 
  xlab("Disease State") + ylab("DNAmGrimAge Acceleration Residual") + 
  scale_x_discrete(breaks = c("age-related macular degeneration","normal"), 
                   labels = c("Age-Related Macular Degeneration", "Normal")) + 
  scale_fill_manual(values = c("#77284e", "#ffffff")) +
  theme_classic() + theme(
              panel.border = element_blank(),
              panel.background = element_blank(),
              plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
              text = element_text(family = "Tahoma"),
              axis.title = element_text(face = "bold"),
              axis.text.x = element_text(colour = "black", size = 11),
              axis.text.y = element_text(colour = "black", size = 9),
              axis.line = element_line(size = 0.5, colour = "black"),
              legend.position = "none")

#Plot of disease state vs control that is stratified by sex:
AMD_Epigenetic_Age_Grim$Sex <- factor(AMD_Epigenetic_Age_Grim$Sex)
ggplot(data = AMD_Epigenetic_Age_Grim, 
       aes(x = Sample_Group, y = AgeAccelGrim, fill = Sex)) + 
  geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
  geom_jitter(aes(fill = Sex), size = 2, shape = 21, alpha = 0.7, 
              position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) + 
  xlab("Disease State") + ylab("DNAmGrimAge Acceleration Residual") + 
  scale_x_discrete(breaks = c("age-related macular degeneration","normal"), 
                   labels = c("AMD", "Normal")) + 
  scale_fill_manual(values = c("#990000", "#2c7dab")) +
  theme_classic() + theme(
              panel.border = element_blank(),
              panel.background = element_blank(),
              plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
              text = element_text(family = "Tahoma"),
              axis.title = element_text(face = "bold"),
              axis.text.x = element_text(colour = "black", size = 11),
              axis.text.y = element_text(colour = "black", size = 9),
              axis.line = element_line(size = 0.5, colour = "black"))

#Determine statistical significance between disease state vs control with ANOVA.
summary(lm(AgeAccelGrim ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Grim))
## 
## Call:
## lm(formula = AgeAccelGrim ~ Sample_Group + Sex + Sex * Sample_Group, 
##     data = AMD_Epigenetic_Age_Grim)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.2295 -1.1402  0.1506  1.6160  4.2953 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)   
## (Intercept)             -1.83212    0.67959  -2.696  0.01022 * 
## Sample_Groupnormal       0.77921    1.05906   0.736  0.46617   
## SexM                     2.47190    0.87734   2.817  0.00748 **
## Sample_Groupnormal:SexM -0.07774    1.34698  -0.058  0.95426   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.149 on 40 degrees of freedom
## Multiple R-squared:  0.2721, Adjusted R-squared:  0.2175 
## F-statistic: 4.984 on 3 and 40 DF,  p-value: 0.004958
summary(aov(AgeAccelGrim ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Grim))
##                  Df Sum Sq Mean Sq F value   Pr(>F)    
## Sample_Group      1   7.05    7.05   1.527 0.223816    
## Sex               1  61.99   61.99  13.422 0.000721 ***
## Sample_Group:Sex  1   0.02    0.02   0.003 0.954264    
## Residuals        40 184.73    4.62                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step 19: Calculate epigenetic age for Weidner clock.

The Weidner clock is developed from an initial 82 blood samples, which are then used to generate a multivariate model. Use the beta values at the following three CpG sites ((α) cg02228185; (β) cg25809905, and (γ) a CpG site upstream of cg17861230) to generate age-prediction, based on the equation from the model below.

Predicted age (in years) = 38.0 - 26.4α - 23.7β + 164.7γ

AMD_Epigenetic_Age_Weidner <- AMD_samplesheet[, c("Sample_Name", "Sample_Group", "Tissue", "Sex", "Age")]
rownames(AMD_Epigenetic_Age_Weidner) <- AMD_Epigenetic_Age_Weidner$Sample_Name

AMD_betas.Weidner <- getBeta(AMD_MSet.noob)

#Subset the 3 CpG sites specified by the clock. 
Weidner_sites <- AMD_betas.Weidner[rownames(AMD_betas.Weidner) %in% c("cg02228185", "cg25809905", "cg17861230"),]
Weidner_sites <- as.data.frame(t(Weidner_sites))


#Check sample order. 
identical(rownames(AMD_Epigenetic_Age_Weidner), rownames(Weidner_sites)) #TRUE.
## [1] TRUE
#Calculate Weidner epigenetic age.
AMD_Epigenetic_Age_Weidner$DNAmAgeWeidner <- 38.0 - 26.4*(Weidner_sites$cg02228185) - 23.7*(Weidner_sites$cg25809905) + 164.7*(Weidner_sites$cg17861230)

#Calculate Weidner age acceleration residual. 
AMD_Epigenetic_Age_Weidner$AgeAccelResidualWeidner <- lm(DNAmAgeWeidner ~ Age, data = AMD_Epigenetic_Age_Weidner)$residual

Step 20: Weidner clock.

#Create function to plot linear regression.
ggplotRegression <- function (fit) {

require(ggplot2)

ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) + 
  geom_point(aes(colour = AMD_Epigenetic_Age_Weidner$Sex, shape = AMD_Epigenetic_Age_Weidner$Sample_Group)) +
  stat_smooth(method = "lm", col = "red") 
}

#Regression plot of Zhang clock predicted DNAmAge against reported age:
AMD_Weidner_plot <- lm(DNAmAgeWeidner ~ Age, data = AMD_Epigenetic_Age_Weidner)
ggplotRegression(AMD_Weidner_plot) + labs(x = "Reported age", y = "DNAmWeidner predicted age", caption = "Comparing reported age and predicted age using DNAmWeidner epigenetic clock") + 
  scale_colour_manual(name = "Sex", values = c("#990000", "#2c7dab")) +
  scale_shape_manual(name = "Disease State", values = c(20, 23), labels = c("AMD", "Normal")) +
  stat_cor(method = "spearman") +
  theme_bw() + theme(panel.border = element_blank(), 
                     panel.grid.major = element_blank(), 
                     panel.grid.minor = element_blank(), 
                     axis.line = element_line(colour = "black"), 
                     axis.text = element_text(), 
                     axis.title = element_text(size = 8), 
                     plot.caption = element_text(hjust = 0.5, size = rel(1)))
## `geom_smooth()` using formula 'y ~ x'

#Plot of overall disease state vs control:
AMD_Epigenetic_Age_Weidner$Sample_Group <- factor(AMD_Epigenetic_Age_Weidner$Sample_Group)
ggplot(data = AMD_Epigenetic_Age_Weidner, 
       aes(x = Sample_Group, y = AgeAccelResidualWeidner, fill = Sample_Group)) +
  geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
  geom_jitter(aes(fill = Sample_Group), size = 2, shape = 21, alpha = 0.7, 
              position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) + 
  xlab("Disease State") + ylab("DNAmWeidner Acceleration Residual") + 
  scale_x_discrete(breaks = c("age-related macular degeneration","normal"), 
                   labels = c("Age-Related Macular Degeneration", "Normal")) + 
  scale_fill_manual(values = c("#77284e", "#ffffff")) +
  theme_classic() + theme(
              panel.border = element_blank(),
              panel.background = element_blank(),
              plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
              text = element_text(family = "Tahoma"),
              axis.title = element_text(face = "bold"),
              axis.text.x = element_text(colour = "black", size = 11),
              axis.text.y = element_text(colour = "black", size = 9),
              axis.line = element_line(size = 0.5, colour = "black"),
              legend.position = "none")

#Plot of disease state vs control that is stratified by sex:
AMD_Epigenetic_Age_Weidner$Sex <- factor(AMD_Epigenetic_Age_Weidner$Sex)
ggplot(data = AMD_Epigenetic_Age_Weidner, 
       aes(x = Sample_Group, y = AgeAccelResidualWeidner, fill = Sex)) + 
  geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
  geom_jitter(aes(fill = Sex), size = 1, shape = 21, alpha = 0.7, 
              position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) + 
  xlab("Disease State") + ylab("DNAmWeidner Acceleration Residual") + 
  scale_x_discrete(breaks = c("age-related macular degeneration","normal"), 
                   labels = c("AMD", "Normal")) + 
  scale_fill_manual(values = c("#990000", "#2c7dab")) +
  theme_classic() + theme(
              panel.border = element_blank(),
              panel.background = element_blank(),
              plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
              text = element_text(family = "Tahoma"),
              axis.title = element_text(face = "bold"),
              axis.text.x = element_text(colour = "black", size = 11),
              axis.text.y = element_text(colour = "black", size = 9),
              axis.line = element_line(size = 0.5, colour = "black"))

#Determine statistical significance between disease state vs control with ANOVA.
summary(lm(AgeAccelResidualWeidner ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Weidner))
## 
## Call:
## lm(formula = AgeAccelResidualWeidner ~ Sample_Group + Sex + Sex * 
##     Sample_Group, data = AMD_Epigenetic_Age_Weidner)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.5349  -8.6345   0.1477   6.4664  26.0141 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)               -3.816      3.612  -1.057    0.297
## Sample_Groupnormal         1.397      5.628   0.248    0.805
## SexM                       7.108      4.663   1.524    0.135
## Sample_Groupnormal:SexM   -4.214      7.159  -0.589    0.559
## 
## Residual standard error: 11.42 on 40 degrees of freedom
## Multiple R-squared:  0.06317,    Adjusted R-squared:  -0.007092 
## F-statistic: 0.8991 on 3 and 40 DF,  p-value: 0.4501
summary(aov(AgeAccelResidualWeidner ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Weidner))
##                  Df Sum Sq Mean Sq F value Pr(>F)
## Sample_Group      1     12   11.67   0.089  0.766
## Sex               1    295  294.96   2.261  0.141
## Sample_Group:Sex  1     45   45.20   0.346  0.559
## Residuals        40   5218  130.44

Step 21: Summary of epigenetic clock outputs.

#Combine all epigenetic clock outputs into a summary table. 
AMD_Epigenetic_All <- join_all(list(AMD_Epigenetic_Age_Horvath,
                                    AMD_Epigenetic_Age_Zhang,
                                    AMD_Epigenetic_Age_Hannum,
                                    AMD_Epigenetic_AgeSkinBlood,
                                    AMD_Epigenetic_Age_Cortical,
                                    AMD_Epigenetic_Age_PedBE,
                                    AMD_Epigenetic_Age_Weidner), 
                               by = "Sample_Name", type = "left")
AMD_Epigenetic_All <- AMD_Epigenetic_All[, !duplicated(colnames(AMD_Epigenetic_All))]

#Change column names to reflect outputs of various clocks. 
colnames(AMD_Epigenetic_All)[10:26] <- c("HorvathAge", "HorvathAgeAccelDiff", "HorvathAgeAccelResidual",
                                         "ZhangENAge", "ZhangENAgeAccelResidual", "ZhangBLUPAge", 
                                         "ZhangBLUPAgeAccelResidual", "HannumAge", "HannumAgeAccelResidual",
                                         "SkinBloodAge", "SkinBloodAgeAccelResidual", "CorticalAge", 
                                         "CorticalAgeAccelResidual", "PedBEAge", "PedBEAgeAccelResidual", 
                                         "WeidnerAge", "WeidnerAgeAccelResidual")

Plot all epigenetic clock outputs.

#Melt wide form data to long form.
AMD_clocks.all <- AMD_Epigenetic_All[, c(1:10, 21, 17, 13, 15, 25, 19)]
AMD_clocks.melt <- melt(AMD_clocks.all)
## Warning in melt(AMD_clocks.all): The melt generic in data.table has been
## passed a data.frame and will attempt to redirect to the relevant reshape2
## method; please note that reshape2 is deprecated, and this redirection is now
## deprecated as well. To continue using melt methods from reshape2 while both
## libraries are attached, e.g. melt.list, you can prepend the namespace like
## reshape2::melt(AMD_clocks.all). In the next version, this warning will become an
## error.
## Using Sample_Name, Sample_Group, Array, Slide, Sex, predictedGender, Tissue, predictedTissue as id variables
colnames(AMD_clocks.melt)[c(9:10)] <- c("Epigenetic_clocks", "Age")

#Plot outputs together.
ggplot(data = AMD_clocks.melt, aes(x = Epigenetic_clocks, y = Age, fill = Epigenetic_clocks)) +
  geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
  xlab("\nEpigenetic Clocks Outputs") + ylab(element_blank()) +
  theme_classic() + theme(
              panel.border = element_blank(),
              panel.background = element_blank(),
              plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
              text = element_text(family = "Tahoma"),
              axis.title = element_text(face = "bold"),
              axis.text.x = element_text(colour = "black", size = 11),
              axis.text.y = element_text(colour = "black", size = 9),
              axis.line = element_line(size = 0.5, colour = "black"),
              legend.position = "none")

#Calculate RMSE.
rmse(AMD_clocks.all$Age, AMD_clocks.all$HorvathAge)
## [1] 41.21593
rmse(AMD_clocks.all$Age, AMD_clocks.all$HannumAge)
## [1] 74.5432
rmse(AMD_clocks.all$Age, AMD_clocks.all$CorticalAge)
## [1] 74.32903
rmse(AMD_clocks.all$Age, AMD_clocks.all$ZhangBLUPAge)
## [1] 74.27721
rmse(AMD_clocks.all$Age, AMD_clocks.all$ZhangENAge)
## [1] 20.60796
rmse(AMD_clocks.all$Age, AMD_clocks.all$SkinBloodAge)
## [1] 74.34481
rmse(AMD_clocks.all$Age, AMD_clocks.all$WeidnerAge)
## [1] 43.93984

Check correlation between predicted ages of various clocks.

#Create correlation matrix.
AMD_clocks.cor <- AMD_clocks.all[, c(9:16)]
AMD_clocks.cor <- round(cor(AMD_clocks.cor), 2)

#Plot paired matrix.
library(psych)
## 
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
## 
##     describe
## The following object is masked from 'package:genefilter':
## 
##     AUC
## The following object is masked from 'package:ROC':
## 
##     AUC
## The following object is masked from 'package:IRanges':
## 
##     reflect
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## The following objects are masked from 'package:scales':
## 
##     alpha, rescale
pairs.panels(AMD_clocks.cor, 
             method = "pearson", # correlation method
             hist.col = "#00AFBB",
             density = FALSE,  # show density plots
             ellipses = FALSE # show correlation ellipses
             )

For Horvath’s clock and Cortical Clock, bin the data to see progression of age acceleration.

#Look at age and predicted age distributions. 
summary(AMD_Epigenetic_All$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   50.00   70.00   74.00   73.66   79.00   89.00
summary(AMD_Epigenetic_All$HorvathAge)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.32   28.86   33.66   33.09   36.81   50.76
summary(AMD_Epigenetic_All$CorticalAge)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -13.6134  -3.9075   0.8102   0.0000   4.0725  14.7770
#Quickly visualise age distribution. 
hist(AMD_Epigenetic_All$Age, main = "Chronological Age Distribution", xlab = "Chronological Age")

#Horvath's clock:
#Bin epigenetic age acceleration difference according to chronological age.
#Define bucket intervals and name them.
tags <- c("Under 60", "60-69", "70-79", "Above 80")

#Place chronological ages into the buckets. Store groups as a new column.
v <- AMD_Epigenetic_All %>% select(Age, HorvathAgeAccelResidual) #pick the variable 
vgroup <- as_tibble(v) %>% mutate(tag = case_when(
    Age < 60 ~ tags[1],
    Age >= 60 & Age < 70 ~ tags[2],
    Age >= 70 & Age < 80 ~ tags[3],
    Age >= 80 ~ tags[4]
    ))
summary(vgroup)
##       Age        HorvathAgeAccelResidual     tag           
##  Min.   :50.00   Min.   :-14.08932       Length:44         
##  1st Qu.:70.00   1st Qu.: -3.51174       Class :character  
##  Median :74.00   Median : -0.07837       Mode  :character  
##  Mean   :73.66   Mean   :  0.00000                         
##  3rd Qu.:79.00   3rd Qu.:  3.32197                         
##  Max.   :89.00   Max.   : 15.09633
#Factorise bins
vgroup$tag <- factor(vgroup$tag, levels = tags, ordered = FALSE)
summary(vgroup$tag)
## Under 60    60-69    70-79 Above 80 
##        2        8       24       10
#Visualise the binned data.
ggplot(data = vgroup, mapping = aes(x = tag, y = HorvathAgeAccelResidual)) + 
  geom_jitter(aes(color = 'blue'), alpha = 0.2) +
  geom_boxplot(fill = "#BB9D00", color = "black", alpha = 0.3) + 
  labs(x = "Chronological Age Deciles", y = "Horvath's Clock Age Acceleration") +
  guides(color = FALSE) +
  theme_minimal() 

#Cortical clock:
#Bin epigenetic age acceleration difference according to chronological age.
#Calculate cortical clock age acceleration difference.
AMD_Epigenetic_All$CorticalAgeAccelDiff <- AMD_Epigenetic_All$CorticalAge - AMD_Epigenetic_All$Age

#Define bucket intervals and name them.
tags <- c("Under 60", "60-69", "70-79", "Above 80")

#Place chronological ages into the buckets. Store groups as a new column.
v <- AMD_Epigenetic_All %>% select(Age, CorticalAgeAccelResidual) #pick the variable 
vgroup <- as_tibble(v) %>% mutate(tag = case_when(
    Age < 60 ~ tags[1],
    Age >= 60 & Age < 70 ~ tags[2],
    Age >= 70 & Age < 80 ~ tags[3],
    Age >= 80 ~ tags[4]
    ))
summary(vgroup)
##       Age        CorticalAgeAccelResidual     tag           
##  Min.   :50.00   Min.   :-10.55257        Length:44         
##  1st Qu.:70.00   1st Qu.: -3.90748        Class :character  
##  Median :74.00   Median :  0.81019        Mode  :character  
##  Mean   :73.66   Mean   :  0.03524                          
##  3rd Qu.:79.00   3rd Qu.:  4.07247                          
##  Max.   :89.00   Max.   : 12.17295
#Factorise bins
vgroup$tag <- factor(vgroup$tag, levels = tags, ordered = FALSE)
summary(vgroup$tag)
## Under 60    60-69    70-79 Above 80 
##        2        8       24       10
#Visualise the binned data.
ggplot(data = vgroup, mapping = aes(x = tag, y = CorticalAgeAccelResidual)) + 
  geom_jitter(aes(color = 'blue'), alpha = 0.2) +
  geom_boxplot(fill = "#00B81F", color = "black", alpha = 0.3) + 
  labs(x = "Chronological Age Deciles", y = "Cortical Clock Age Acceleration") +
  guides(color = FALSE) +
  theme_minimal()

Step 22: Age-associated CpG sites overlap.

library(VennDiagram)
library(grid)

#Load epigenetic clock CpGs.
Epigenetic_clocks_CpGs <- read.csv("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Epigenetic_clocks_CpGs.csv",
                                     header = TRUE)
Epigenetic_clocks_CpGs[Epigenetic_clocks_CpGs==""] <- NA

DNAmHorvath <- na.omit(as.character(Epigenetic_clocks_CpGs$DNAmHorvath))
DNAmHannum <- na.omit(as.character(Epigenetic_clocks_CpGs$DNAmHannum))
DNAmCortical <- na.omit(as.character(Epigenetic_clocks_CpGs$DNAmCortical))
DNAmSkinBlood <- na.omit(as.character(Epigenetic_clocks_CpGs$DNAmSkinBlood))
DNAmZhang <- na.omit(as.character(Epigenetic_clocks_CpGs$DNAmZhang))
DNAmWeidner <- na.omit(as.character(Epigenetic_clocks_CpGs$DNAmWeidner))

#Check similarity between DNAmWeidner CpGs with the other clocks.
Epigenetic_clocks_CpGs[Epigenetic_clocks_CpGs$DNAmHorvath %in% DNAmWeidner,]$DNAmHorvath
Epigenetic_clocks_CpGs[Epigenetic_clocks_CpGs$DNAmHannum %in% DNAmWeidner,]$DNAmHannum
Epigenetic_clocks_CpGs[Epigenetic_clocks_CpGs$DNAmCortical %in% DNAmWeidner,]$DNAmCortical
Epigenetic_clocks_CpGs[Epigenetic_clocks_CpGs$DNAmSkinBlood %in% DNAmWeidner,]$DNAmSkinBlood
Epigenetic_clocks_CpGs[Epigenetic_clocks_CpGs$DNAmZhang %in% DNAmWeidner,]$DNAmZhang
Epigenetic_clocks_CpGs[Epigenetic_clocks_CpGs$DNAmPhenoAge %in% DNAmWeidner,]$DNAmPhenoAge

#Make Venn diagram.
x <- list(DNAmHorvath, DNAmHannum, DNAmZhang, DNAmSkinBlood, DNAmCortical)

setwd("~/KoborLab/kobor_space/kendrix/macular_degeneration/plots/")
futile.logger::flog.threshold(futile.logger::ERROR, name = "VennDiagramLogger")
venn_cpg <- venn.diagram(x = x, 
             category.names = c("DNAmHorvath" , "DNAmHannum", "DNAmZhang", "DNAmSkinBlood", "DNAmCortical"), 
             filename = NULL,
             cat.cex = 0.8, cat.fontface = "bold", cat.default.pos = "inner", 
             cat.pos = c(10, 25, 55, 185, 190),
             cat.dist = c(0.08, 0.08, 0.08, 0.08, 0.08),
             scale = FALSE
             )
grid.newpage()
grid.draw(venn_cpg)

Find overlap between age-associated CpGs of Horvath’s and Cortical clocks.

library(VennDiagram)
## Loading required package: futile.logger
## Warning: package 'futile.logger' was built under R version 3.6.3
## 
## Attaching package: 'futile.logger'
## The following object is masked from 'package:mgcv':
## 
##     scat
## 
## Attaching package: 'VennDiagram'
## The following object is masked from 'package:ggpubr':
## 
##     rotate
## The following object is masked from 'package:ape':
## 
##     rotate
## The following object is masked from 'package:dendextend':
## 
##     rotate
library(grid)

#Load epigenetic clock CpGs.
Epigenetic_clocks_CpGs <- read.csv("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Epigenetic_clocks_CpGs.csv",
                                     header = TRUE)
Epigenetic_clocks_CpGs[Epigenetic_clocks_CpGs==""] <- NA

DNAmHorvath <- na.omit(as.character(Epigenetic_clocks_CpGs$DNAmHorvath))
DNAmCortical <- na.omit(as.character(Epigenetic_clocks_CpGs$DNAmCortical))

#Make Venn diagram.
x <- list(DNAmHorvath, DNAmCortical)

setwd("~/KoborLab/kobor_space/kendrix/macular_degeneration/plots/")
futile.logger::flog.threshold(futile.logger::ERROR, name = "VennDiagramLogger")
## NULL
venn_overlap <- venn.diagram(x = x, 
             category.names = c("DNAmHorvath" , "DNAmCortical"), 
             filename = NULL,
             cat.cex = 1, cat.fontface = "bold", cat.default.pos = "outer", 
             cat.pos = c(-25, 25),
             cat.dist = c(0.04, 0.04),
             scaled = FALSE
             )
grid.newpage()
grid.draw(venn_overlap) #5 overlapping CpGs. 

#Find out what the overlapping CpGs are. 
overlap_cpgs <- intersect(DNAmHorvath, DNAmCortical)

#Load 450K annotation data.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")

EPIC_Annotation_Complete[which(EPIC_Annotation_Complete$Name %in% overlap_cpgs),]
##                  Name AddressA_ID
## cg21801378 cg21801378  0035637875
## cg23124451 cg23124451  0092806941
## cg19724470 cg19724470  0086696312
## cg08370996 cg08370996  0076666492
## cg06144905 cg06144905  0084705969
##                                              AlleleA_ProbeSeq AddressB_ID
## cg21801378 TAATAACATCATAATCCTTCATAAATACAACAAAACCAAAATTTAACCCA  0091713156
## cg23124451 AAAATACTAATAACRACCRAACCTACCTAAACTAAAAATTAAAATCRAAC            
## cg19724470 ACTTACACAAATCATATTACTAACAAACAACTAACRATAAACTTAAACCC            
## cg08370996 CCCCCTTTTTAACATATTTAATCACTTTAATTCTCTATTCTTTTCTCTCC            
## cg06144905 TAATTTTCCCCTCTTCAACCTAATTTACTTTAAAAATTCTAAAAAACCCC            
##                                              AlleleB_ProbeSeq
## cg21801378 TAATAACGTCGTAATCCTTCATAAATACGACGAAACCGAAATTTAACCCG
## cg23124451                                                   
## cg19724470                                                   
## cg08370996                                                   
## cg06144905                                                   
##            Infinium_Design_Type Next_Base Color_Channel
## cg21801378                    I         C           Grn
## cg23124451                   II                        
## cg19724470                   II                        
## cg08370996                   II                        
## cg06144905                   II                        
##                                                                                                                        Forward_Sequence
## cg21801378 CCACGAAGAGCTTGATGGCGTCGTGGTCCTTCATGGGTACGGCGGGACCGGGGTTTAGCC[CG]CTCATGCCGACGCCGCTGTCCGCGGTGCTGAAACCCAGGCGCGGGCCGGGGCCAGCGGGC
## cg23124451 TCAGTCTCCCCATATTTACAATAAAAGGGGAGCGAGGTGGGATGGCGCTGAGGATCCCTA[CG]TCCGATCCTAATCTCCAGCTCAGGCAGGCTCGGCCGCCACTAGCATCCTGGAGCGACAAC
## cg19724470 CATTCTTATGCGACTGTGTGTTCAGAATATAGCTCTGATGCTAGGCTGGAGGTCTGGACA[CG]GGTCCAAGTCCACCGCCAGCTGCTTGCTAGTAACATGACTTGTGTAAGTTATCCCAGCTG
## cg08370996 CCCTCCCGCGCCCCCCTTTTTAGCATATTTGATCACTTTGATTCTCTGTTCTTTTCTCTC[CG]CGGTGTGTGTGTGCGTGCGCGCGTGTGTGTTTTCTTCTTCTCCTCCTCCTCTCCCCGAGT
## cg06144905 CTGACCTCACCACCCACCAGGGAGGTGGGTCTTATTCTGGGCATCGTGCCAAGTTCTTAG[CG]GGGCCCTCTAGAATCTCTAAAGCAAATCAGGCTGAAGAGGGGAAAACCAGCAGGGGGAGG
##            Genome_Build CHR  MAPINFO
## cg21801378           37  15 72612125
## cg23124451           37  22 39548131
## cg19724470           37   9  5450936
## cg08370996           37  15 96874031
## cg06144905           37  17 27369780
##                                                     SourceSeq Strand
## cg21801378 CGGGCTAAACCCCGGTCCCGCCGTACCCATGAAGGACCACGACGCCATCA      R
## cg23124451 CGTCCGATCCTAATCTCCAGCTCAGGCAGGCTCGGCCGCCACTAGCATCC      F
## cg19724470 CGGGTCCAAGTCCACCGCCAGCTGCTTGCTAGTAACATGACTTGTGTAAG      F
## cg08370996 CCCCTTTTTAGCATATTTGATCACTTTGATTCTCTGTTCTTTTCTCTCCG      R
## cg06144905 CGGGGCCCTCTAGAATCTCTAAAGCAAATCAGGCTGAAGAGGGGAAAACC      F
##            UCSC_RefGene_Name UCSC_RefGene_Accession UCSC_RefGene_Group
## cg21801378           BRUNOL6              NM_052840            1stExon
## cg23124451              CBX7              NM_175709               Body
## cg19724470             CD274              NM_014143              5'UTR
## cg08370996       NR2F2;NR2F2 NM_021005;NM_001145155        TSS200;Body
## cg06144905             PIPOX              NM_016518             TSS200
##              UCSC_CpG_Islands_Name Relation_to_UCSC_CpG_Island
## cg21801378 chr15:72611946-72612802                      Island
## cg23124451 chr22:39548102-39548432                      Island
## cg19724470    chr9:5450409-5450629                     S_Shore
## cg08370996 chr15:96873408-96877721                      Island
## cg06144905                                                    
##                    Phantom4_Enhancers Phantom5_Enhancers DMR X450k_Enhancer
## cg21801378                                                               NA
## cg23124451                                                               NA
## cg19724470                                                               NA
## cg08370996 high-CpG:94674791-94675270                                    NA
## cg06144905                                                               NA
##                      HMM_Island Regulatory_Feature_Name
## cg21801378 15:70399042-70400040    15:72611781-72613209
## cg23124451 22:37877959-37879580                        
## cg19724470                            9:5449680-5451039
## cg08370996 15:94674413-94676109    15:96873561-96874038
## cg06144905                                             
##                         Regulatory_Feature_Group
## cg21801378                   Promoter_Associated
## cg23124451                                      
## cg19724470                   Promoter_Associated
## cg08370996 NonGene_Associated_Cell_type_specific
## cg06144905                                      
##                                         GencodeBasicV12_NAME
## cg21801378 CELF6;CELF6;CELF6;CELF6;RP11-106M3.3;RP11-106M3.2
## cg23124451                                                  
## cg19724470                                       CD274;CD274
## cg08370996                                       NR2F2;NR2F2
## cg06144905                                             PIPOX
##                                                                                              GencodeBasicV12_Accession
## cg21801378 ENST00000539635.1;ENST00000567083.1;ENST00000287202.5;ENST00000539635.1;ENST00000570175.1;ENST00000379915.4
## cg23124451                                                                                                            
## cg19724470                                                                         ENST00000381573.4;ENST00000381577.3
## cg08370996                                                                         ENST00000394166.3;ENST00000394166.3
## cg06144905                                                                                           ENST00000323372.4
##                                GencodeBasicV12_Group
## cg21801378 1stExon;1stExon;1stExon;5'UTR;5'UTR;3'UTR
## cg23124451                                          
## cg19724470                               5'UTR;5'UTR
## cg08370996                             5'UTR;1stExon
## cg06144905                                    TSS200
##                                                GencodeCompV12_NAME
## cg21801378 CELF6;CELF6;CELF6;CELF6;CELF6;RP11-106M3.3;RP11-106M3.2
## cg23124451                                               CBX7;CBX7
## cg19724470                                             CD274;CD274
## cg08370996                                             NR2F2;NR2F2
## cg06144905                                       PIPOX;PIPOX;PIPOX
##                                                                                                                 GencodeCompV12_Accession
## cg21801378 ENST00000539635.1;ENST00000567083.1;ENST00000569547.1;ENST00000287202.5;ENST00000539635.1;ENST00000570175.1;ENST00000379915.4
## cg23124451                                                                                           ENST00000477827.1;ENST00000482294.1
## cg19724470                                                                                           ENST00000381573.4;ENST00000381577.3
## cg08370996                                                                                           ENST00000394166.3;ENST00000394166.3
## cg06144905                                                                         ENST00000323372.4;ENST00000466889.1;ENST00000469082.1
##                                         GencodeCompV12_Group
## cg21801378 1stExon;1stExon;1stExon;1stExon;5'UTR;5'UTR;3'UTR
## cg23124451                                       3'UTR;3'UTR
## cg19724470                                       5'UTR;5'UTR
## cg08370996                                     5'UTR;1stExon
## cg06144905                            TSS200;TSS1500;TSS1500
##            DNase_Hypersensitivity_NAME DNase_Hypersensitivity_Evidence_Count
## cg21801378     chr15:72611880-72612415                                     3
## cg23124451                                                                NA
## cg19724470        chr9:5450860-5451110                                     3
## cg08370996     chr15:96873300-96875030                                     3
## cg06144905     chr17:27369445-27370195                                     3
##                 OpenChromatin_NAME OpenChromatin_Evidence_Count
## cg21801378                                                   NA
## cg23124451 chr22:39546864-39549854                            6
## cg19724470                                                   NA
## cg08370996                                                   NA
## cg06144905                                                   NA
##                          TFBS_NAME TFBS_Evidence_Count Methyl27_Loci
## cg21801378                                          NA          TRUE
## cg23124451 chr22:39547710-39549680                   6          TRUE
## cg19724470                                          NA          TRUE
## cg08370996                                          NA          TRUE
## cg06144905                                          NA          TRUE
##            Methyl450_Loci Chromosome_36 Coordinate_36
## cg21801378           TRUE            15      70399179
## cg23124451           TRUE            22      37878077
## cg19724470           TRUE             9       5440936
## cg08370996           TRUE            15      94675035
## cg06144905           TRUE            17      24393906
##                                                                          SNP_ID
## cg21801378          rs115738898;rs141628783;rs150517327;rs368237855;rs574078216
## cg23124451                                                                     
## cg19724470 rs185005359;rs866066;rs559393240;rs532929011;rs551514059;rs190049644
## cg08370996                                                                     
## cg06144905                                  rs202118547;rs570473003;rs536095412
##                 SNP_DISTANCE
## cg21801378     30;29;27;14;1
## cg23124451                  
## cg19724470 12;17;23;33;34;40
## cg08370996                  
## cg06144905             1;1;2
##                                         SNP_MinorAlleleFrequency Random_Loci  X
## cg21801378          0.010982;0.000246;0.000247;0.000200;0.000200          NA NA
## cg23124451                                                                NA NA
## cg19724470 0.000200;0.139477;0.000799;0.000200;0.000200;0.000200          NA NA
## cg08370996                                                                NA NA
## cg06144905                            0.016374;0.000200;0.001198          NA NA
##            strand    Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs SBE_maf CH_450_XY
## cg21801378      - rs115738898  0.010982   <NA>      NA   <NA>      NA        No
## cg23124451      +        <NA>        NA   <NA>      NA   <NA>      NA        No
## cg19724470      +    rs866066  0.278954   <NA>      NA   <NA>      NA        No
## cg08370996      -        <NA>        NA   <NA>      NA   <NA>      NA        No
## cg06144905      +        <NA>        NA   <NA>      NA   <NA>      NA        No
##            CH_450_Aut CH_EPIC Cross_Hyb
## cg21801378         No      No        No
## cg23124451         No      No        No
## cg19724470         No      No        No
## cg08370996         No      No        No
## cg06144905         No      No        No
EPIC_Annotation_Complete[which(EPIC_Annotation_Complete$Name %in% overlap_cpgs),]$UCSC_RefGene_Name
## [1] BRUNOL6     CBX7        CD274       NR2F2;NR2F2 PIPOX      
## 66070 Levels:  A1BG A1BG-AS1;A1BG A1BG-AS1;A1BG;ZNF497;ZNF497 ... ZZZ3;ZZZ3;ZZZ3

Find overlap between age-associated clock CpGs and EWAS hits.

#Load epigenetic clock CpGs.
EWAS_hits <- read.csv("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/EWAS_hits.csv", header = TRUE)

#Find overlap.
Epigenetic_clocks_CpGs[Epigenetic_clocks_CpGs$DNAmHorvath %in% EWAS_hits$EWAS_hits,]$DNAmHorvath
## factor(0)
## 354 Levels:  cg00075967 cg00091693 cg00168942 cg00374717 ... cg27544190
Epigenetic_clocks_CpGs[Epigenetic_clocks_CpGs$DNAmHannum %in% EWAS_hits$EWAS_hits,]$DNAmHannum
## factor(0)
## 72 Levels:  cg00481951 cg00486113 cg00748589 cg01528542 ... ch.2.30415474F
Epigenetic_clocks_CpGs[Epigenetic_clocks_CpGs$DNAmCortical %in% EWAS_hits$EWAS_hits,]$DNAmCortical
## factor(0)
## 348 Levels:  cg00059225 cg00088042 cg00252534 cg00297950 ... ch.2.71774667F
Epigenetic_clocks_CpGs[Epigenetic_clocks_CpGs$DNAmSkinBlood %in% EWAS_hits$EWAS_hits,]$DNAmSkinBlood
## factor(0)
## 392 Levels:  cg00017842 cg00073460 cg00194146 cg00257455 ... cg27544190
Epigenetic_clocks_CpGs[Epigenetic_clocks_CpGs$DNAmZhang %in% EWAS_hits$EWAS_hits,]$DNAmZhang
## [1] cg22197050
## 514 Levels: cg00096065 cg00241664 cg00271522 cg00302979 ... cg27640528
Epigenetic_clocks_CpGs[Epigenetic_clocks_CpGs$DNAmPhenoAge %in% EWAS_hits$EWAS_hits,]$DNAmPhenoAge
## factor(0)
## 514 Levels:  cg00079056 cg00083937 cg00113951 cg00168942 ... cg27655905
Epigenetic_clocks_CpGs[Epigenetic_clocks_CpGs$DNAmWeidner %in% EWAS_hits$EWAS_hits,]$DNAmWeidner
## factor(0)
## Levels:  cg02228185 cg17861230 cg25809905

Step 23: Prepare data for DNAmTL.

DNAmTL is a methylation estimator for telomere length based on 140CpGs. DNAmTL is said to be not only an epigenetic biomarker of replicative history of cells, but also a useful marker for age-related pathologies that are associated with it.

This script is used to do telomere length prediction based on Illumina 450K and EPIC DNA methylation data. Coefficients of the predictor are based on training samples.

#Subset relevant epigenetic age variables. 
AMD_Epigenetic_DNAmTL <- AMD_pData[, c("Sample_Name", "Sample_Group", "Array", "Slide", "Sex", "predictedGender", "Tissue", "predictedTissue", "Age", "DNAmTL")] 

Step 24: DNAmTL predictor.

#Create function to plot linear regression.
ggplotRegression <- function (fit) {

require(ggplot2)

ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) + 
  geom_point(aes(colour = AMD_Epigenetic_DNAmTL$Sex, shape = AMD_Epigenetic_DNAmTL$Sample_Group)) +
  stat_smooth(method = "lm", col = "red") 
}

#Regression plot of Zhang clock predicted DNAmAge against reported age:
AMD_DNAmTL_plot <- lm(DNAmTL ~ Age, data = AMD_Epigenetic_DNAmTL)
ggplotRegression(AMD_DNAmTL_plot) + labs(x = "Reported age", y = "DNAm telomere length", caption = "Correlation between chronological age and predicted telomere length") + 
  scale_colour_manual(name = "Sex", values = c("#990000", "#2c7dab")) +
  scale_shape_manual(name = "Disease State", values = c(20, 23), labels = c("AMD", "Normal")) +
  stat_cor(method = "spearman") +
  theme_bw() + theme(panel.border = element_blank(), 
                     panel.grid.major = element_blank(), 
                     panel.grid.minor = element_blank(), 
                     axis.line = element_line(colour = "black"), 
                     axis.text = element_text(), 
                     axis.title = element_text(size = 8), 
                     plot.caption = element_text(hjust = 0.5, size = rel(1)))
## `geom_smooth()` using formula 'y ~ x'

#Plot of overall disease state vs control:
AMD_Epigenetic_DNAmTL$Sample_Group <- factor(AMD_Epigenetic_DNAmTL$Sample_Group)
ggplot(data = AMD_Epigenetic_DNAmTL, 
       aes(x = Sample_Group, y = DNAmTL, fill = Sample_Group)) +
  geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
  geom_jitter(aes(fill = Sample_Group), size = 2, shape = 21, alpha = 0.7, 
              position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) + 
  xlab("Disease State") + ylab("DNAmTL Prediction") + 
  scale_x_discrete(breaks = c("age-related macular degeneration","normal"), 
                   labels = c("Age-Related Macular Degeneration", "Normal")) + 
  scale_fill_manual(values = c("#77284e", "#ffffff")) +
  theme_classic() + theme(
              panel.border = element_blank(),
              panel.background = element_blank(),
              plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
              text = element_text(family = "Tahoma"),
              axis.title = element_text(face = "bold"),
              axis.text.x = element_text(colour = "black", size = 11),
              axis.text.y = element_text(colour = "black", size = 9),
              axis.line = element_line(size = 0.5, colour = "black"),
              legend.position = "none")


References

Chen Y et al. (2013) Discovery of cross-reactive probes and polymorphic CpGs in the Illumina Infinium HumanMethylation450 microarray. Epigenetics.

Heiss J & Just JC (2019) Improved filtering of DNA methylation microarray data by detection p values and its impact on downstream anayses. Clinical Epigenetics.

Heiss J & Just JC (2018) Identifying mislabeled and contaminated DNA methylation microarray data: an extended quality control toolset with examples from GEO. Clinial Epigenetics.